REDACTED-rig/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp

1105 lines
33 KiB
C++

//
// Created by Haifa Bogdan Adnan on 06/08/2018.
//
#include "../../../common/common.h"
#include "OpenCLKernel.h"
string OpenCLKernel = R"OCL(
#define THREADS_PER_LANE 32
#define BLOCK_SIZE_ULONG 128
#define BLOCK_SIZE_UINT 256
#define ARGON2_PREHASH_DIGEST_LENGTH_UINT 16
#define ARGON2_PREHASH_SEED_LENGTH_UINT 18
#define ARGON2_BLOCK_SIZE 1024
#define ARGON2_DWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 4)
#define BLAKE_SHARED_MEM_ULONG 76
#define ARGON2_RAW_LENGTH 8
#define ARGON2_TYPE_VALUE 2
#define ARGON2_VERSION 0x13
#define BLOCK_BYTES 32
#define OUT_BYTES 16
#ifdef USE_AMD_BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n)))
#else
#define rotr64(x, n) rotate((x), 64UL - (n))
#endif
#define G(m, r, i, a, b, c, d) \
{ \
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
}
#define G_S(m, a, b, c, d) \
{ \
a = a + b + m; \
d = rotr64(d ^ a, 32); \
c = c + d; \
b = rotr64(b ^ c, 24); \
a = a + b + m; \
d = rotr64(d ^ a, 16); \
c = c + d; \
b = rotr64(b ^ c, 63); \
}
#define ROUND(m, t, r, shfl) \
{ \
G(m, r, t, v0, v1, v2, v3); \
shfl[t + 4] = v1; \
shfl[t + 8] = v2; \
shfl[t + 12] = v3; \
barrier(CLK_LOCAL_MEM_FENCE); \
v1 = shfl[((t + 1) % 4)+ 4]; \
v2 = shfl[((t + 2) % 4)+ 8]; \
v3 = shfl[((t + 3) % 4)+ 12]; \
G(m, r, (t + 4), v0, v1, v2, v3); \
shfl[((t + 1) % 4)+ 4] = v1; \
shfl[((t + 2) % 4)+ 8] = v2; \
shfl[((t + 3) % 4)+ 12] = v3; \
barrier(CLK_LOCAL_MEM_FENCE); \
v1 = shfl[t + 4]; \
v2 = shfl[t + 8]; \
v3 = shfl[t + 12]; \
}
#define ROUND_S(m, t, shfl) \
{ \
G_S(m, v0, v1, v2, v3); \
shfl[t + 4] = v1; \
shfl[t + 8] = v2; \
shfl[t + 12] = v3; \
barrier(CLK_LOCAL_MEM_FENCE); \
v1 = shfl[((t + 1) % 4)+ 4]; \
v2 = shfl[((t + 2) % 4)+ 8]; \
v3 = shfl[((t + 3) % 4)+ 12]; \
G_S(m, v0, v1, v2, v3); \
shfl[((t + 1) % 4)+ 4] = v1; \
shfl[((t + 2) % 4)+ 8] = v2; \
shfl[((t + 3) % 4)+ 12] = v3; \
barrier(CLK_LOCAL_MEM_FENCE); \
v1 = shfl[t + 4]; \
v2 = shfl[t + 8]; \
v3 = shfl[t + 12]; \
}
__constant ulong blake2b_IV[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};
__constant uint blake2b_sigma[12][16] = {
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};
void blake2b_compress(__local ulong *h, __local ulong *m, ulong f0, __local ulong *shfl, int thr_id)
{
ulong v0, v1, v2, v3;
barrier(CLK_LOCAL_MEM_FENCE);
v0 = h[thr_id];
v1 = h[thr_id + 4];
v2 = blake2b_IV[thr_id];
v3 = blake2b_IV[thr_id + 4];
if(thr_id == 0) v3 ^= h[8];
if(thr_id == 1) v3 ^= h[9];
if(thr_id == 2) v3 ^= f0;
ROUND(m, thr_id, 0, shfl);
ROUND(m, thr_id, 1, shfl);
ROUND(m, thr_id, 2, shfl);
ROUND(m, thr_id, 3, shfl);
ROUND(m, thr_id, 4, shfl);
ROUND(m, thr_id, 5, shfl);
ROUND(m, thr_id, 6, shfl);
ROUND(m, thr_id, 7, shfl);
ROUND(m, thr_id, 8, shfl);
ROUND(m, thr_id, 9, shfl);
ROUND(m, thr_id, 10, shfl);
ROUND(m, thr_id, 11, shfl);
h[thr_id] ^= v0 ^ v2;
h[thr_id + 4] ^= v1 ^ v3;
}
void blake2b_compress_static(__local ulong *h, ulong m, ulong f0, __local ulong *shfl, int thr_id)
{
ulong v0, v1, v2, v3;
barrier(CLK_LOCAL_MEM_FENCE);
v0 = h[thr_id];
v1 = h[thr_id + 4];
v2 = blake2b_IV[thr_id];
v3 = blake2b_IV[thr_id + 4];
if(thr_id == 0) v3 ^= h[8];
if(thr_id == 1) v3 ^= h[9];
if(thr_id == 2) v3 ^= f0;
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
ROUND_S(m, thr_id, shfl);
h[thr_id] ^= v0 ^ v2;
h[thr_id + 4] ^= v1 ^ v3;
}
void blake2b_incrementCounter(__local ulong *h, int inc)
{
h[8] += (inc * 4);
h[9] += (h[8] < (inc * 4));
}
void blake2b_final_global(__global uint *out, int out_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id)
{
int left = BLOCK_BYTES - buf_len;
__local uint *cursor_out_local = buf + buf_len;
for(int i=0; i < (left >> 2); i++, cursor_out_local += 4) {
cursor_out_local[thr_id] = 0;
}
if(thr_id == 0) {
for (int i = 0; i < (left % 4); i++) {
cursor_out_local[i] = 0;
}
blake2b_incrementCounter(h, buf_len);
}
blake2b_compress(h, (__local ulong *)buf, 0xFFFFFFFFFFFFFFFF, shfl, thr_id);
__local uint *cursor_in = (__local uint *)h;
__global uint *cursor_out_global = out;
for(int i=0; i < (out_len >> 2); i++, cursor_in += 4, cursor_out_global += 4) {
cursor_out_global[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (out_len % 4); i++) {
cursor_out_global[i] = cursor_in[i];
}
}
}
void blake2b_final_local(__local uint *out, int out_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id)
{
int left = BLOCK_BYTES - buf_len;
__local uint *cursor_out = buf + buf_len;
for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
cursor_out[thr_id] = 0;
}
if(thr_id == 0) {
for (int i = 0; i < (left % 4); i++) {
cursor_out[i] = 0;
}
blake2b_incrementCounter(h, buf_len);
}
blake2b_compress(h, (__local ulong *)buf, 0xFFFFFFFFFFFFFFFF, shfl, thr_id);
__local uint *cursor_in = (__local uint *)h;
cursor_out = out;
for(int i=0; i < (out_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (out_len % 4); i++) {
cursor_out[i] = cursor_in[i];
}
}
}
int blake2b_update_global(__global uint *in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id)
{
__global uint *cursor_in = in;
__local uint *cursor_out = buf + buf_len;
if (buf_len + in_len > BLOCK_BYTES) {
int left = BLOCK_BYTES - buf_len;
for(int i=0; i < (left >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (left % 4); i++) {
cursor_out[i] = cursor_in[i];
}
blake2b_incrementCounter(h, BLOCK_BYTES);
}
blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);
buf_len = 0;
in_len -= left;
in += left;
while (in_len > BLOCK_BYTES) {
if(thr_id == 0)
blake2b_incrementCounter(h, BLOCK_BYTES);
cursor_in = in;
cursor_out = buf;
for(int i=0; i < (BLOCK_BYTES / 4); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);
in_len -= BLOCK_BYTES;
in += BLOCK_BYTES;
}
}
cursor_in = in;
cursor_out = buf + buf_len;
for(int i=0; i < (in_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (in_len % 4); i++) {
cursor_out[i] = cursor_in[i];
}
}
return buf_len + in_len;
}
int blake2b_update_static(uint in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id)
{
ulong in64 = in;
in64 = in64 << 32;
in64 = in64 | in;
__local uint *cursor_out = buf + buf_len;
if (buf_len + in_len > BLOCK_BYTES) {
int left = BLOCK_BYTES - buf_len;
for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
cursor_out[thr_id] = in;
}
if(thr_id == 0) {
for (int i = 0; i < (left % 4); i++) {
cursor_out[i] = in;
}
blake2b_incrementCounter(h, BLOCK_BYTES);
}
blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);
buf_len = 0;
in_len -= left;
while (in_len > BLOCK_BYTES) {
if(thr_id == 0)
blake2b_incrementCounter(h, BLOCK_BYTES);
blake2b_compress_static(h, in64, 0, shfl, thr_id);
in_len -= BLOCK_BYTES;
}
}
cursor_out = buf + buf_len;
for(int i=0; i < (in_len >> 2); i++, cursor_out += 4) {
cursor_out[thr_id] = in;
}
if(thr_id == 0) {
for (int i = 0; i < (in_len % 4); i++) {
cursor_out[i] = in;
}
}
return buf_len + in_len;
}
int blake2b_update_local(__local uint *in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id)
{
__local uint *cursor_in = in;
__local uint *cursor_out = buf + buf_len;
if (buf_len + in_len > BLOCK_BYTES) {
int left = BLOCK_BYTES - buf_len;
for(int i=0; i < (left >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (left % 4); i++) {
cursor_out[i] = cursor_in[i];
}
blake2b_incrementCounter(h, BLOCK_BYTES);
}
blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);
buf_len = 0;
in_len -= left;
in += left;
while (in_len > BLOCK_BYTES) {
if(thr_id == 0)
blake2b_incrementCounter(h, BLOCK_BYTES);
cursor_in = in;
cursor_out = buf;
for(int i=0; i < (BLOCK_BYTES / 4); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);
in_len -= BLOCK_BYTES;
in += BLOCK_BYTES;
}
}
cursor_in = in;
cursor_out = buf + buf_len;
for(int i=0; i < (in_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (in_len % 4); i++) {
cursor_out[i] = cursor_in[i];
}
}
return buf_len + in_len;
}
int blake2b_init(__local ulong *h, int out_len, int thr_id)
{
h[thr_id * 2] = blake2b_IV[thr_id * 2];
h[thr_id * 2 + 1] = blake2b_IV[thr_id * 2 + 1];
if(thr_id == 0) {
h[8] = h[9] = 0;
h[0] = 0x6A09E667F3BCC908 ^ ((out_len * 4) | (1 << 16) | (1 << 24));
}
return 0;
}
void blake2b_digestLong_global(__global uint *out, int out_len,
__global uint *in, int in_len,
int thr_id, __local ulong* shared)
{
__local ulong *h = shared;
__local ulong *shfl = &h[10];
__local uint *buf = (__local uint *)&shfl[16];
__local uint *out_buffer = &buf[32];
int buf_len;
if(thr_id == 0) buf[0] = (out_len * 4);
buf_len = 1;
if (out_len <= OUT_BYTES) {
blake2b_init(h, out_len, thr_id);
buf_len = blake2b_update_global(in, in_len, h, buf, buf_len, shfl, thr_id);
blake2b_final_global(out, out_len, h, buf, buf_len, shfl, thr_id);
} else {
__local uint *cursor_in = out_buffer;
__global uint *cursor_out = out;
blake2b_init(h, OUT_BYTES, thr_id);
buf_len = blake2b_update_global(in, in_len, h, buf, buf_len, shfl, thr_id);
blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
out += OUT_BYTES / 2;
int to_produce = out_len - OUT_BYTES / 2;
while (to_produce > OUT_BYTES) {
buf_len = blake2b_init(h, OUT_BYTES, thr_id);
buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
cursor_out = out;
cursor_in = out_buffer;
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
out += OUT_BYTES / 2;
to_produce -= OUT_BYTES / 2;
}
buf_len = blake2b_init(h, to_produce, thr_id);
buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
blake2b_final_global(out, to_produce, h, buf, buf_len, shfl, thr_id);
}
}
void blake2b_digestLong_local(__global uint *out, int out_len,
__local uint *in, int in_len,
int thr_id, __local ulong* shared)
{
__local ulong *h = shared;
__local ulong *shfl = &h[10];
__local uint *buf = (__local uint *)&shfl[16];
__local uint *out_buffer = &buf[32];
int buf_len;
if(thr_id == 0) buf[0] = (out_len * 4);
buf_len = 1;
if (out_len <= OUT_BYTES) {
blake2b_init(h, out_len, thr_id);
buf_len = blake2b_update_local(in, in_len, h, buf, buf_len, shfl, thr_id);
blake2b_final_global(out, out_len, h, buf, buf_len, shfl, thr_id);
} else {
__local uint *cursor_in = out_buffer;
__global uint *cursor_out = out;
blake2b_init(h, OUT_BYTES, thr_id);
buf_len = blake2b_update_local(in, in_len, h, buf, buf_len, shfl, thr_id);
blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
out += OUT_BYTES / 2;
int to_produce = out_len - OUT_BYTES / 2;
while (to_produce > OUT_BYTES) {
buf_len = blake2b_init(h, OUT_BYTES, thr_id);
buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
cursor_out = out;
cursor_in = out_buffer;
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
out += OUT_BYTES / 2;
to_produce -= OUT_BYTES / 2;
}
buf_len = blake2b_init(h, to_produce, thr_id);
buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
blake2b_final_global(out, to_produce, h, buf, buf_len, shfl, thr_id);
}
}
#define fBlaMka(x, y) ((x) + (y) + 2 * upsample(mul_hi((uint)(x), (uint)(y)), (uint)(x) * (uint)y))
#define COMPUTE \
a = fBlaMka(a, b); \
d = rotr64(d ^ a, (ulong)32); \
c = fBlaMka(c, d); \
b = rotr64(b ^ c, (ulong)24); \
a = fBlaMka(a, b); \
d = rotr64(d ^ a, (ulong)16); \
c = fBlaMka(c, d); \
b = rotr64(b ^ c, (ulong)63);
__constant char offsets_round_1[32][4] = {
{ 0, 4, 8, 12 },
{ 1, 5, 9, 13 },
{ 2, 6, 10, 14 },
{ 3, 7, 11, 15 },
{ 16, 20, 24, 28 },
{ 17, 21, 25, 29 },
{ 18, 22, 26, 30 },
{ 19, 23, 27, 31 },
{ 32, 36, 40, 44 },
{ 33, 37, 41, 45 },
{ 34, 38, 42, 46 },
{ 35, 39, 43, 47 },
{ 48, 52, 56, 60 },
{ 49, 53, 57, 61 },
{ 50, 54, 58, 62 },
{ 51, 55, 59, 63 },
{ 64, 68, 72, 76 },
{ 65, 69, 73, 77 },
{ 66, 70, 74, 78 },
{ 67, 71, 75, 79 },
{ 80, 84, 88, 92 },
{ 81, 85, 89, 93 },
{ 82, 86, 90, 94 },
{ 83, 87, 91, 95 },
{ 96, 100, 104, 108 },
{ 97, 101, 105, 109 },
{ 98, 102, 106, 110 },
{ 99, 103, 107, 111 },
{ 112, 116, 120, 124 },
{ 113, 117, 121, 125 },
{ 114, 118, 122, 126 },
{ 115, 119, 123, 127 },
};
__constant char offsets_round_2[32][4] = {
{ 0, 5, 10, 15 },
{ 1, 6, 11, 12 },
{ 2, 7, 8, 13 },
{ 3, 4, 9, 14 },
{ 16, 21, 26, 31 },
{ 17, 22, 27, 28 },
{ 18, 23, 24, 29 },
{ 19, 20, 25, 30 },
{ 32, 37, 42, 47 },
{ 33, 38, 43, 44 },
{ 34, 39, 40, 45 },
{ 35, 36, 41, 46 },
{ 48, 53, 58, 63 },
{ 49, 54, 59, 60 },
{ 50, 55, 56, 61 },
{ 51, 52, 57, 62 },
{ 64, 69, 74, 79 },
{ 65, 70, 75, 76 },
{ 66, 71, 72, 77 },
{ 67, 68, 73, 78 },
{ 80, 85, 90, 95 },
{ 81, 86, 91, 92 },
{ 82, 87, 88, 93 },
{ 83, 84, 89, 94 },
{ 96, 101, 106, 111 },
{ 97, 102, 107, 108 },
{ 98, 103, 104, 109 },
{ 99, 100, 105, 110 },
{ 112, 117, 122, 127 },
{ 113, 118, 123, 124 },
{ 114, 119, 120, 125 },
{ 115, 116, 121, 126 },
};
__constant char offsets_round_3[32][4] = {
{ 0, 32, 64, 96 },
{ 1, 33, 65, 97 },
{ 16, 48, 80, 112 },
{ 17, 49, 81, 113 },
{ 2, 34, 66, 98 },
{ 3, 35, 67, 99 },
{ 18, 50, 82, 114 },
{ 19, 51, 83, 115 },
{ 4, 36, 68, 100 },
{ 5, 37, 69, 101 },
{ 20, 52, 84, 116 },
{ 21, 53, 85, 117 },
{ 6, 38, 70, 102 },
{ 7, 39, 71, 103 },
{ 22, 54, 86, 118 },
{ 23, 55, 87, 119 },
{ 8, 40, 72, 104 },
{ 9, 41, 73, 105 },
{ 24, 56, 88, 120 },
{ 25, 57, 89, 121 },
{ 10, 42, 74, 106 },
{ 11, 43, 75, 107 },
{ 26, 58, 90, 122 },
{ 27, 59, 91, 123 },
{ 12, 44, 76, 108 },
{ 13, 45, 77, 109 },
{ 28, 60, 92, 124 },
{ 29, 61, 93, 125 },
{ 14, 46, 78, 110 },
{ 15, 47, 79, 111 },
{ 30, 62, 94, 126 },
{ 31, 63, 95, 127 },
};
__constant char offsets_round_4[32][4] = {
{ 0, 33, 80, 113 },
{ 1, 48, 81, 96 },
{ 16, 49, 64, 97 },
{ 17, 32, 65, 112 },
{ 2, 35, 82, 115 },
{ 3, 50, 83, 98 },
{ 18, 51, 66, 99 },
{ 19, 34, 67, 114 },
{ 4, 37, 84, 117 },
{ 5, 52, 85, 100 },
{ 20, 53, 68, 101 },
{ 21, 36, 69, 116 },
{ 6, 39, 86, 119 },
{ 7, 54, 87, 102 },
{ 22, 55, 70, 103 },
{ 23, 38, 71, 118 },
{ 8, 41, 88, 121 },
{ 9, 56, 89, 104 },
{ 24, 57, 72, 105 },
{ 25, 40, 73, 120 },
{ 10, 43, 90, 123 },
{ 11, 58, 91, 106 },
{ 26, 59, 74, 107 },
{ 27, 42, 75, 122 },
{ 12, 45, 92, 125 },
{ 13, 60, 93, 108 },
{ 28, 61, 76, 109 },
{ 29, 44, 77, 124 },
{ 14, 47, 94, 127 },
{ 15, 62, 95, 110 },
{ 30, 63, 78, 111 },
{ 31, 46, 79, 126 },
};
#define G1(data) \
{ \
barrier(CLK_LOCAL_MEM_FENCE); \
a = data[i1_0]; \
b = data[i1_1]; \
c = data[i1_2]; \
d = data[i1_3]; \
COMPUTE \
data[i1_1] = b; \
data[i1_2] = c; \
data[i1_3] = d; \
barrier(CLK_LOCAL_MEM_FENCE); \
}
#define G2(data) \
{ \
b = data[i2_1]; \
c = data[i2_2]; \
d = data[i2_3]; \
COMPUTE \
data[i2_0] = a; \
data[i2_1] = b; \
data[i2_2] = c; \
data[i2_3] = d; \
barrier(CLK_LOCAL_MEM_FENCE); \
}
#define G3(data) \
{ \
a = data[i3_0]; \
b = data[i3_1]; \
c = data[i3_2]; \
d = data[i3_3]; \
COMPUTE \
data[i3_1] = b; \
data[i3_2] = c; \
data[i3_3] = d; \
barrier(CLK_LOCAL_MEM_FENCE); \
}
#define G4(data) \
{ \
b = data[i4_1]; \
c = data[i4_2]; \
d = data[i4_3]; \
COMPUTE \
data[i4_0] = a; \
data[i4_1] = b; \
data[i4_2] = c; \
data[i4_3] = d; \
barrier(CLK_LOCAL_MEM_FENCE); \
}
__kernel void fill_blocks(__global ulong *chunk_0,
__global ulong *chunk_1,
__global ulong *chunk_2,
__global ulong *chunk_3,
__global ulong *chunk_4,
__global ulong *chunk_5,
__global ulong *seed,
__global ulong *out,
__global uint *refs,
__global uint *idxs,
__global uint *segments,
int memsize,
int lanes,
int seg_length,
int seg_count,
int threads_per_chunk,
int thread_idx,
__local ulong *scratchpad) { // lanes * BLOCK_SIZE_ULONG
ulong4 tmp;
ulong a, b, c, d;
int hash_base = get_group_id(0) * 2;
int mem_hash = hash_base + thread_idx;
int local_id = get_local_id(0);
int hash_idx = (local_id / THREADS_PER_LANE) % 2;
int wave_id = local_id % (THREADS_PER_LANE * 2);
int id = wave_id % THREADS_PER_LANE;
int lane = local_id / (THREADS_PER_LANE * 2);
int lane_length = seg_length * 4;
int hash = hash_base + hash_idx;
ulong chunks[6];
chunks[0] = (ulong)chunk_0;
chunks[1] = (ulong)chunk_1;
chunks[2] = (ulong)chunk_2;
chunks[3] = (ulong)chunk_3;
chunks[4] = (ulong)chunk_4;
chunks[5] = (ulong)chunk_5;
int chunk_index = mem_hash / threads_per_chunk;
int chunk_offset = mem_hash - chunk_index * threads_per_chunk;
__global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);
int i1_0 = offsets_round_1[id][0];
int i1_1 = offsets_round_1[id][1];
int i1_2 = offsets_round_1[id][2];
int i1_3 = offsets_round_1[id][3];
int i2_0 = offsets_round_2[id][0];
int i2_1 = offsets_round_2[id][1];
int i2_2 = offsets_round_2[id][2];
int i2_3 = offsets_round_2[id][3];
int i3_0 = offsets_round_3[id][0];
int i3_1 = offsets_round_3[id][1];
int i3_2 = offsets_round_3[id][2];
int i3_3 = offsets_round_3[id][3];
int i4_0 = offsets_round_4[id][0];
int i4_1 = offsets_round_4[id][1];
int i4_2 = offsets_round_4[id][2];
int i4_3 = offsets_round_4[id][3];
__global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
__global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;
vstore4(vload4(id, seed_mem), id, seed_dst);
seed_mem += BLOCK_SIZE_ULONG;
seed_dst += (2 * BLOCK_SIZE_ULONG);
vstore4(vload4(id, seed_mem), id, seed_dst);
__global ulong *next_block;
__global ulong *prev_block;
__global uint *seg_refs;
__global uint *seg_idxs;
__local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG;
segments += (lane * 3);
for(int s=0; s < (seg_count / lanes); s++) {
int idx = ((s == 0) ? 2 : 0); // index for first slice in each lane is 2
int with_xor = ((s >= 4) ? 1 : 0);
int keep = 1;
int slice = s % 4;
int pass = s / 4;
__global int *cur_seg = &segments[s * lanes * 3];
int cur_idx = cur_seg[0];
int prev_idx = cur_seg[1];
int seg_type = cur_seg[2];
int ref_idx = 0;
prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;
tmp = vload4(wave_id, prev_block);
if(seg_type == 0) {
seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
ref_idx = seg_refs[0];
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
if(idxs != 0) {
seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
cur_idx = seg_idxs[0];
}
for (int i=0;idx < seg_length;i++, idx++) {
next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1)
prefetch(next_block, BLOCK_SIZE_ULONG);
tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
if (idx < seg_length - 1) {
ref_idx = seg_refs[i + 1];
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
if(idxs != 0) {
keep = cur_idx & 0x80000000;
cur_idx = seg_idxs[i + 1];
}
else
cur_idx++;
}
vstore4(tmp, id, state);
G1(state);
G2(state);
G3(state);
G4(state);
if(with_xor == 1)
tmp ^= vload4(wave_id, next_block);
tmp ^= vload4(id, state);
if(keep > 0) {
vstore4(tmp, wave_id, next_block);
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
}
else {
vstore4(tmp, id, state);
barrier(CLK_LOCAL_MEM_FENCE);
for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1)
prefetch(next_block, BLOCK_SIZE_ULONG);
ulong pseudo_rand = state[0];
if(lanes == 1) {
uint reference_area_size = 0;
if(pass > 0) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = slice * seg_length + idx - 1; // seg_length
}
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
relative_position = (relative_position * relative_position) >> 32;
relative_position = reference_area_size - 1 -
((reference_area_size * relative_position) >> 32);
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
}
else {
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
uint reference_area_size = 0;
if(pass > 0) {
if (lane == ref_lane) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
}
}
else {
if (lane == ref_lane) {
reference_area_size = slice * seg_length + idx - 1; // seg_length
} else {
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
}
}
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
relative_position = (relative_position * relative_position) >> 32;
relative_position = reference_area_size - 1 -
((reference_area_size * relative_position) >> 32);
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
}
tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
vstore4(tmp, id, state);
G1(state);
G2(state);
G3(state);
G4(state);
if(with_xor == 1)
tmp ^= vload4(wave_id, next_block);
tmp ^= vload4(id, state);
vstore4(tmp, id, state);
vstore4(tmp, wave_id, next_block);
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
}
}
}
vstore4(tmp, id, state);
barrier(CLK_LOCAL_MEM_FENCE);
if(lane == 0) { // first lane needs to acumulate results
__global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
for(int l=1; l<lanes; l++)
tmp ^= vload4(id, scratchpad + (l * 2 + hash_idx) * BLOCK_SIZE_ULONG);
vstore4(tmp, id, out_mem);
}
};
__kernel void prehash (
__global uint *preseed,
__global uint *seed,
int memsz,
int lanes,
int passes,
int pwdlen,
int saltlen,
int threads,
__local ulong *blake_shared) {
int seeds_batch_size = get_local_size(0) / 4; // number of seeds per block
int hash_batch_size = seeds_batch_size / (lanes * 2); // number of hashes per block
int id = get_local_id(0); // minimum 64 threads
int thr_id = id % 4; // thread id in session
int session = id / 4; // blake2b hashing session
int hash = get_group_id(0) * hash_batch_size;
int hash_idx = session / (lanes * 2);
hash += hash_idx;
if(hash < threads) {
int hash_session = session % (lanes * 2); // session in hash
int lane = hash_session / 2; // 2 seeds per lane
int idx = hash_session % 2; // seed idx in lane
__local uint *local_mem = (__local uint *)&blake_shared[session * BLAKE_SHARED_MEM_ULONG];
__global uint *local_seed = seed + (hash * lanes * 2 + hash_session) * BLOCK_SIZE_UINT;
__local ulong *h = (__local ulong *)&local_mem[20];
__local ulong *shfl = &h[10];
__local uint *buf = (__local uint *)&shfl[16];
__local uint *value = &buf[32];
__local uint *local_preseed = &value[1];
__global uint *cursor_in = preseed;
__local uint *cursor_out = local_preseed;
for(int i=0; i < (pwdlen >> 2); i++, cursor_in += 4, cursor_out += 4) {
cursor_out[thr_id] = cursor_in[thr_id];
}
if(thr_id == 0) {
for (int i = 0; i < (pwdlen % 4); i++) {
cursor_out[i] = cursor_in[i];
}
uint nonce = (preseed[9] >> 24) | (preseed[10] << 8);
nonce += hash;
local_preseed[9] = (preseed[9] & 0x00FFFFFF) | (nonce << 24);
local_preseed[10] = (preseed[10] & 0xFF000000) | (nonce >> 8);
}
int buf_len = blake2b_init(h, ARGON2_PREHASH_DIGEST_LENGTH_UINT, thr_id);
*value = lanes; //lanes
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = 32; //outlen
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = memsz; //m_cost
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = passes; //t_cost
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = ARGON2_VERSION; //version
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = ARGON2_TYPE_VALUE; //type
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
*value = pwdlen * 4; //pw_len
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
buf_len = blake2b_update_local(local_preseed, pwdlen, h, buf, buf_len, shfl, thr_id);
*value = saltlen * 4; //salt_len
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
buf_len = blake2b_update_local(local_preseed, saltlen, h, buf, buf_len, shfl, thr_id);
*value = 0; //secret_len
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
buf_len = blake2b_update_local(0, 0, h, buf, buf_len, shfl, thr_id);
*value = 0; //ad_len
buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
buf_len = blake2b_update_local(0, 0, h, buf, buf_len, shfl, thr_id);
blake2b_final_local(local_mem, ARGON2_PREHASH_DIGEST_LENGTH_UINT, h, buf, buf_len, shfl, thr_id);
if (thr_id == 0) {
local_mem[ARGON2_PREHASH_DIGEST_LENGTH_UINT] = idx;
local_mem[ARGON2_PREHASH_DIGEST_LENGTH_UINT + 1] = lane;
}
blake2b_digestLong_local(local_seed, ARGON2_DWORDS_IN_BLOCK, local_mem, ARGON2_PREHASH_SEED_LENGTH_UINT, thr_id, (__local ulong *)&local_mem[20]);
}
}
__kernel void posthash (
__global uint *hash,
__global uint *out,
__global uint *preseed,
__local ulong *blake_shared) {
int hash_id = get_group_id(0);
int thread = get_local_id(0);
__global uint *local_hash = hash + hash_id * (ARGON2_RAW_LENGTH + 1);
__global uint *local_out = out + hash_id * BLOCK_SIZE_UINT;
blake2b_digestLong_global(local_hash, ARGON2_RAW_LENGTH, local_out, ARGON2_DWORDS_IN_BLOCK, thread, blake_shared);
if(thread == 0) {
uint nonce = (preseed[9] >> 24) | (preseed[10] << 8);
nonce += hash_id;
local_hash[ARGON2_RAW_LENGTH] = nonce;
}
}
)OCL";