//
// Created by Haifa Bogdan Adnan on 06/08/2018.
//

#include "../../../common/common.h"

#include "OpenCLKernel.h"

// OpenCL C source text for the BLAKE2b helpers and the Argon2 kernels
// (fill_blocks, posthash, and a pre-hash kernel), kept in a raw string literal.
// Presumably handed to the OpenCL runtime compiler by the host code, which is
// not visible in this file.
string OpenCLKernel = R"OCL(
#define THREADS_PER_LANE 32
#define BLOCK_SIZE_ULONG 128
#define BLOCK_SIZE_UINT 256
#define ARGON2_PREHASH_DIGEST_LENGTH_UINT 16
#define ARGON2_PREHASH_SEED_LENGTH_UINT 18
#define ARGON2_BLOCK_SIZE 1024
#define ARGON2_DWORDS_IN_BLOCK (ARGON2_BLOCK_SIZE / 4)
#define BLAKE_SHARED_MEM_ULONG 76
#define ARGON2_RAW_LENGTH 8
#define ARGON2_TYPE_VALUE 2
#define ARGON2_VERSION 0x13
// NOTE(review): despite the _BYTES suffix, the two values below are used as
// counts of uint elements (they are compared with, and added to, offsets into
// `uint` buffers in the blake2b_* helpers) - confirm before renaming.
#define BLOCK_BYTES 32
#define OUT_BYTES 16

// rotr64(x, n): rotate the 64-bit value x right by n bits, expressed as a left
// rotate by (64 - n) through the rotate() built-in. When USE_AMD_BITALIGN is
// defined and n < 32 it is instead assembled from two amd_bitalign() calls on
// the 32-bit halves of x.
#ifdef USE_AMD_BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable

#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n)))
#else
#define rotr64(x, n) rotate((x), 64UL - (n))
#endif

// G: one mixing step on (a, b, c, d). The two message words are picked from m
// through row r of blake2b_sigma, at columns 2*i and 2*i + 1.
#define G(m, r, i, a, b, c, d) \
{ \
    a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
    d = rotr64(d ^ a, 32); \
    c = c + d; \
    b = rotr64(b ^ c, 24); \
    a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
    d = rotr64(d ^ a, 16); \
    c = c + d; \
    b = rotr64(b ^ c, 63); \
}

// G_S: same mixing step as G, but m is a single scalar that is added in both
// places, so no blake2b_sigma lookup is needed.
#define G_S(m, a, b, c, d) \
{ \
    a = a + b + m; \
    d = rotr64(d ^ a, 32); \
    c = c + d; \
    b = rotr64(b ^ c, 24); \
    a = a + b + m; \
    d = rotr64(d ^ a, 16); \
    c = c + d; \
    b = rotr64(b ^ c, 63); \
}

// ROUND: one round for work-item t, operating on the caller's v0..v3.
// It runs G with index t, publishes v1..v3 into shfl, reads back the entries
// at rotated positions ((t + k) % 4), runs G with index t + 4, then writes
// those entries back and reloads its own. Contains two local-memory barriers.
#define ROUND(m, t, r, shfl) \
{ \
    G(m, r, t, v0, v1, v2, v3); \
    shfl[t + 4] = v1; \
    shfl[t + 8] = v2; \
    shfl[t + 12] = v3; \
    barrier(CLK_LOCAL_MEM_FENCE); \
    v1 = shfl[((t + 1) % 4)+ 4]; \
    v2 = shfl[((t + 2) % 4)+ 8]; \
    v3 = shfl[((t + 3) % 4)+ 12]; \
    G(m, r, (t + 4), v0, v1, v2, v3); \
    shfl[((t + 1) % 4)+ 4] = v1; \
    shfl[((t + 2) % 4)+ 8] = v2; \
    shfl[((t + 3) % 4)+ 12] = v3; \
    barrier(CLK_LOCAL_MEM_FENCE); \
    v1 = shfl[t + 4]; \
    v2 = shfl[t + 8]; \
    v3 = shfl[t + 12]; \
}

// ROUND_S: same exchange pattern as ROUND, using G_S with the scalar m.
#define ROUND_S(m, t, shfl) \
{ \
    G_S(m, v0, v1, v2, v3); \
    shfl[t + 4] = v1; \
    shfl[t + 8] = v2; \
    shfl[t + 12] = v3; \
    barrier(CLK_LOCAL_MEM_FENCE); \
    v1 = shfl[((t + 1) % 4)+ 4]; \
    v2 = shfl[((t + 2) % 4)+ 8]; \
    v3 = shfl[((t + 3) % 4)+ 12]; \
    G_S(m, v0, v1, v2, v3); \
    shfl[((t + 1) % 4)+ 4] = v1; \
    shfl[((t + 2) % 4)+ 8] = v2; \
    shfl[((t + 3) % 4)+ 12] = v3; \
    barrier(CLK_LOCAL_MEM_FENCE); \
    v1 = shfl[t + 4]; \
    v2 = shfl[t + 8]; \
    v3 = shfl[t + 12]; \
}

// Initial state words used by blake2b_init and the compress functions.
__constant ulong blake2b_IV[8] = {
    0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
    0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
    0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
    0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
};

// Message-word selection per round (12 rows, indexed by the r argument of G).
__constant uint blake2b_sigma[12][16] = {
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
    {11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
    {7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
    {9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
    {2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
    {12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
    {13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
    {6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
    {10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
    {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
};

// Compression of one message block, shared between cooperating work-items.
//   h      state in local memory; h[0..7] is updated, h[8]/h[9]/f0 are mixed
//          into v3 of work-items 0/1/2 respectively
//   m      message words in local memory (indexed through blake2b_sigma)
//   f0     value XORed into work-item 2's v3 (0 from the update paths,
//          all ones from the final paths)
//   shfl   local scratch used by ROUND to exchange v1..v3 between work-items
//   thr_id index of this work-item; it owns h[thr_id] and h[thr_id + 4].
//          Presumably 0..3 (blake2b_IV has 8 entries) - confirm with callers.
// Executes barriers, so every participating work-item has to call it.
void blake2b_compress(__local ulong *h, __local ulong *m, ulong f0, __local ulong *shfl, int thr_id) {
    ulong v0, v1, v2, v3;

    // Make earlier writes to h / m by other work-items visible before reading.
    barrier(CLK_LOCAL_MEM_FENCE);

    v0 = h[thr_id];
    v1 = h[thr_id + 4];
    v2 = blake2b_IV[thr_id];
    v3 = blake2b_IV[thr_id + 4];

    if(thr_id == 0) v3 ^= h[8];
    if(thr_id == 1) v3 ^= h[9];
    if(thr_id == 2) v3 ^= f0;

    ROUND(m, thr_id, 0, shfl);
    ROUND(m, thr_id, 1, shfl);
    ROUND(m, thr_id, 2, shfl);
    ROUND(m, thr_id, 3, shfl);
    ROUND(m, thr_id, 4, shfl);
    ROUND(m, thr_id, 5, shfl);
    ROUND(m, thr_id, 6, shfl);
    ROUND(m, thr_id, 7, shfl);
    ROUND(m, thr_id, 8, shfl);
    ROUND(m, thr_id, 9, shfl);
    ROUND(m, thr_id, 10, shfl);
    ROUND(m, thr_id, 11, shfl);

    h[thr_id] ^= v0 ^ v2;
    h[thr_id + 4] ^= v1 ^ v3;
}

// Same as blake2b_compress, except that the message is the single scalar m
// used for every message word (ROUND_S / G_S), so no message buffer is read.
void blake2b_compress_static(__local ulong *h, ulong m, ulong f0, __local ulong *shfl, int thr_id) {
    ulong v0, v1, v2, v3;

    barrier(CLK_LOCAL_MEM_FENCE);

    v0 = h[thr_id];
    v1 = h[thr_id + 4];
    v2 = blake2b_IV[thr_id];
    v3 = blake2b_IV[thr_id + 4];

    if(thr_id == 0) v3 ^= h[8];
    if(thr_id == 1) v3 ^= h[9];
    if(thr_id == 2) v3 ^= f0;

    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);
    ROUND_S(m, thr_id, shfl);

    h[thr_id] ^= v0 ^ v2;
    h[thr_id + 4] ^= v1 ^ v3;
}

// Adds inc * 4 to the counter word h[8] and carries an overflow into h[9].
// NOTE(review): the "* 4" suggests inc is a count of 32-bit words and the
// counter is kept in bytes - confirm.
void blake2b_incrementCounter(__local ulong *h, int inc)
{
    h[8] += (inc * 4);
    h[9] += (h[8] < (inc * 4));
}

// Finishes a hash and writes the result to global memory.
// Zero-fills buf from buf_len up to BLOCK_BYTES, bumps the counter by buf_len
// (work-item 0 only), compresses with f0 = all ones, then copies out_len uints
// of h to out. Bulk copies move one uint per work-item per iteration in groups
// of four; work-item 0 handles the remainder (len % 4).
void blake2b_final_global(__global uint *out, int out_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id) {
    int left = BLOCK_BYTES - buf_len;
    __local uint *cursor_out_local = buf + buf_len;

    for(int i=0; i < (left >> 2); i++, cursor_out_local += 4) {
        cursor_out_local[thr_id] = 0;
    }

    if(thr_id == 0) {
        for (int i = 0; i < (left % 4); i++) {
            cursor_out_local[i] = 0;
        }
        blake2b_incrementCounter(h, buf_len);
    }

    blake2b_compress(h, (__local ulong *)buf, 0xFFFFFFFFFFFFFFFF, shfl, thr_id);

    __local uint *cursor_in = (__local uint *)h;
    __global uint *cursor_out_global = out;

    for(int i=0; i < (out_len >> 2); i++, cursor_in += 4, cursor_out_global += 4) {
        cursor_out_global[thr_id] = cursor_in[thr_id];
    }

    if(thr_id == 0) {
        for (int i = 0; i < (out_len % 4); i++) {
            cursor_out_global[i] = cursor_in[i];
        }
    }
}

// Same as blake2b_final_global, but the digest is written to local memory.
void blake2b_final_local(__local uint *out, int out_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id) {
    int left = BLOCK_BYTES - buf_len;
    __local uint *cursor_out = buf + buf_len;

    for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
        cursor_out[thr_id] = 0;
    }

    if(thr_id == 0) {
        for (int i = 0; i < (left % 4); i++) {
            cursor_out[i] = 0;
        }
        blake2b_incrementCounter(h, buf_len);
    }

    blake2b_compress(h, (__local ulong *)buf, 0xFFFFFFFFFFFFFFFF, shfl, thr_id);

    __local uint *cursor_in = (__local uint *)h;
    cursor_out = out;

    for(int i=0; i < (out_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
        cursor_out[thr_id] = cursor_in[thr_id];
    }

    if(thr_id == 0) {
        for (int i = 0; i < (out_len % 4); i++) {
            cursor_out[i] = cursor_in[i];
        }
    }
}

// Appends in_len uints from global memory to the hash input.
// If the pending buffer would exceed BLOCK_BYTES it is topped up and
// compressed, further full blocks are copied into buf and compressed while
// more than BLOCK_BYTES remain, and the tail is left in buf.
// Returns the new number of uints pending in buf.
int blake2b_update_global(__global uint *in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id) {
    __global uint *cursor_in = in;
    __local uint *cursor_out = buf + buf_len;

    if (buf_len + in_len > BLOCK_BYTES) {
        int left = BLOCK_BYTES - buf_len;

        for(int i=0; i < (left >> 2); i++, cursor_in += 4, cursor_out += 4) {
            cursor_out[thr_id] = cursor_in[thr_id];
        }

        if(thr_id == 0) {
            for (int i = 0; i < (left % 4); i++) {
                cursor_out[i] = cursor_in[i];
            }
            blake2b_incrementCounter(h, BLOCK_BYTES);
        }

        blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);

        buf_len = 0;

        in_len -= left;
        in += left;

        // Strictly "greater than": the last block always stays in buf so the
        // final call can compress it.
        while (in_len > BLOCK_BYTES) {
            if(thr_id == 0)
                blake2b_incrementCounter(h, BLOCK_BYTES);

            cursor_in = in;
            cursor_out = buf;

            for(int i=0; i < (BLOCK_BYTES / 4); i++, cursor_in += 4, cursor_out += 4) {
                cursor_out[thr_id] = cursor_in[thr_id];
            }

            blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);

            in_len -= BLOCK_BYTES;
            in += BLOCK_BYTES;
        }
    }

    cursor_in = in;
    cursor_out = buf + buf_len;

    for(int i=0; i < (in_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
        cursor_out[thr_id] = cursor_in[thr_id];
    }

    if(thr_id == 0) {
        for (int i = 0; i < (in_len % 4); i++) {
            cursor_out[i] = cursor_in[i];
        }
    }

    return buf_len + in_len;
}

// Variant of the update step whose input is the single value `in` repeated
// in_len times. Partial blocks are filled in buf as usual; whole blocks after
// the first go through blake2b_compress_static with `in` duplicated into both
// halves of a ulong, without touching buf.
// Returns the new number of uints pending in buf.
int blake2b_update_static(uint in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id) {
    ulong in64 = in;
    in64 = in64 << 32;
    in64 = in64 | in;

    __local uint *cursor_out = buf + buf_len;

    if (buf_len + in_len > BLOCK_BYTES) {
        int left = BLOCK_BYTES - buf_len;

        for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
            cursor_out[thr_id] = in;
        }

        if(thr_id == 0) {
            for (int i = 0; i < (left % 4); i++) {
                cursor_out[i] = in;
            }
            blake2b_incrementCounter(h, BLOCK_BYTES);
        }

        blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);

        buf_len = 0;

        in_len -= left;

        while (in_len > BLOCK_BYTES) {
            if(thr_id == 0)
                blake2b_incrementCounter(h, BLOCK_BYTES);

            blake2b_compress_static(h, in64, 0, shfl, thr_id);

            in_len -= BLOCK_BYTES;
        }
    }

    cursor_out = buf + buf_len;

    for(int i=0; i < (in_len >> 2); i++, cursor_out += 4) {
        cursor_out[thr_id] = in;
    }

    if(thr_id == 0) {
        for (int i = 0; i < (in_len % 4); i++) {
            cursor_out[i] = in;
        }
    }

    return buf_len + in_len;
}

// Same as blake2b_update_global, with the input read from local memory.
int blake2b_update_local(__local uint *in, int in_len, __local ulong *h, __local uint *buf, int buf_len, __local ulong *shfl, int thr_id) {
    __local uint *cursor_in = in;
    __local uint *cursor_out = buf + buf_len;

    if (buf_len + in_len > BLOCK_BYTES) {
        int left = BLOCK_BYTES - buf_len;

        for(int i=0; i < (left >> 2); i++, cursor_in += 4, cursor_out += 4) {
            cursor_out[thr_id] = cursor_in[thr_id];
        }

        if(thr_id == 0) {
            for (int i = 0; i < (left % 4); i++) {
                cursor_out[i] = cursor_in[i];
            }
            blake2b_incrementCounter(h, BLOCK_BYTES);
        }

        blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);

        buf_len = 0;

        in_len -= left;
        in += left;

        while (in_len > BLOCK_BYTES) {
            if(thr_id == 0)
                blake2b_incrementCounter(h, BLOCK_BYTES);

            cursor_in = in;
            cursor_out = buf;

            for(int i=0; i < (BLOCK_BYTES / 4); i++, cursor_in += 4, cursor_out += 4) {
                cursor_out[thr_id] = cursor_in[thr_id];
            }

            blake2b_compress(h, (__local ulong *)buf, 0, shfl, thr_id);

            in_len -= BLOCK_BYTES;
            in += BLOCK_BYTES;
        }
    }

    cursor_in = in;
    cursor_out = buf + buf_len;

    for(int i=0; i < (in_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
        cursor_out[thr_id] = cursor_in[thr_id];
    }

    if(thr_id == 0) {
        for (int i = 0; i < (in_len % 4); i++) {
            cursor_out[i] = cursor_in[i];
        }
    }

    return buf_len + in_len;
}

// Resets the state: each work-item loads two IV words into h, then work-item 0
// clears the counter words h[8]/h[9] and folds (out_len * 4) together with
// bits 16 and 24 into h[0].
// Always returns 0, which callers use as the initial buf_len.
int blake2b_init(__local ulong *h, int out_len, int thr_id) {
    h[thr_id * 2] = blake2b_IV[thr_id * 2];
    h[thr_id * 2 + 1] = blake2b_IV[thr_id * 2 + 1];

    if(thr_id == 0) {
        h[8] = h[9] = 0;
        h[0] = 0x6A09E667F3BCC908 ^ ((out_len * 4) | (1 << 16) | (1 << 24));
    }

    return 0;
}

// Variable-length digest of a global-memory input, written to global memory.
// `shared` is carved up as: h (10 ulongs), shfl (16 ulongs), buf (32 uints),
// then out_buffer. The value out_len * 4 is placed in buf[0] ahead of the
// input. For out_len <= OUT_BYTES a single hash is produced; otherwise hashes
// are chained through out_buffer, emitting OUT_BYTES / 2 uints per step, and
// the last step produces the remaining to_produce uints directly into out.
void blake2b_digestLong_global(__global uint *out, int out_len,
                               __global uint *in, int in_len,
                               int thr_id, __local ulong* shared) {
    __local ulong *h = shared;
    __local ulong *shfl = &h[10];
    __local uint *buf = (__local uint *)&shfl[16];
    __local uint *out_buffer = &buf[32];
    int buf_len;

    if(thr_id == 0) buf[0] = (out_len * 4);
    buf_len = 1;

    if (out_len <= OUT_BYTES) {
        blake2b_init(h, out_len, thr_id);
        buf_len = blake2b_update_global(in, in_len, h, buf, buf_len, shfl, thr_id);
        blake2b_final_global(out, out_len, h, buf, buf_len, shfl, thr_id);
    } else {
        __local uint *cursor_in = out_buffer;
        __global uint *cursor_out = out;

        blake2b_init(h, OUT_BYTES, thr_id);
        buf_len = blake2b_update_global(in, in_len, h, buf, buf_len, shfl, thr_id);
        blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);

        // Emit the first half of the intermediate digest.
        for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
            cursor_out[thr_id] = cursor_in[thr_id];
        }

        out += OUT_BYTES / 2;

        int to_produce = out_len - OUT_BYTES / 2;
        while (to_produce > OUT_BYTES) {
            buf_len = blake2b_init(h, OUT_BYTES, thr_id);
            buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
            blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);

            cursor_out = out;
            cursor_in = out_buffer;
            for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
                cursor_out[thr_id] = cursor_in[thr_id];
            }

            out += OUT_BYTES / 2;
            to_produce -= OUT_BYTES / 2;
        }

        buf_len = blake2b_init(h, to_produce, thr_id);
        buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
        blake2b_final_global(out, to_produce, h, buf, buf_len, shfl, thr_id);
    }
}

// Same as blake2b_digestLong_global, with the input read from local memory.
void blake2b_digestLong_local(__global uint *out, int out_len,
                              __local uint *in, int in_len,
                              int thr_id, __local ulong* shared) {
    __local ulong *h = shared;
    __local ulong *shfl = &h[10];
    __local uint *buf = (__local uint *)&shfl[16];
    __local uint *out_buffer = &buf[32];
    int buf_len;

    if(thr_id == 0) buf[0] = (out_len * 4);
    buf_len = 1;

    if (out_len <= OUT_BYTES) {
        blake2b_init(h, out_len, thr_id);
        buf_len = blake2b_update_local(in, in_len, h, buf, buf_len, shfl, thr_id);
        blake2b_final_global(out, out_len, h, buf, buf_len, shfl, thr_id);
    } else {
        __local uint *cursor_in = out_buffer;
        __global uint *cursor_out = out;

        blake2b_init(h, OUT_BYTES, thr_id);
        buf_len = blake2b_update_local(in, in_len, h, buf, buf_len, shfl, thr_id);
        blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);

        for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
            cursor_out[thr_id] = cursor_in[thr_id];
        }

        out += OUT_BYTES / 2;

        int to_produce = out_len - OUT_BYTES / 2;
        while (to_produce > OUT_BYTES) {
            buf_len = blake2b_init(h, OUT_BYTES, thr_id);
            buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
            blake2b_final_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);

            cursor_out = out;
            cursor_in = out_buffer;
            for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
                cursor_out[thr_id] = cursor_in[thr_id];
            }

            out += OUT_BYTES / 2;
            to_produce -= OUT_BYTES / 2;
        }

        buf_len = blake2b_init(h, to_produce, thr_id);
        buf_len = blake2b_update_local(out_buffer, OUT_BYTES, h, buf, buf_len, shfl, thr_id);
        blake2b_final_global(out, to_produce, h, buf, buf_len, shfl, thr_id);
    }
}

// fBlaMka(x, y) = x + y + 2 * lo32(x) * lo32(y); the 64-bit product is built
// from mul_hi() for the upper half and a 32-bit multiply for the lower half,
// joined with upsample().
#define fBlaMka(x, y) ((x) + (y) + 2 * upsample(mul_hi((uint)(x), (uint)(y)), (uint)(x) * (uint)y))

// COMPUTE: mixing step on the caller's a, b, c, d using fBlaMka in place of
// plain addition.
#define COMPUTE \
    a = fBlaMka(a, b); \
    d = rotr64(d ^ a, (ulong)32); \
    c = fBlaMka(c, d); \
    b = rotr64(b ^ c, (ulong)24); \
    a = fBlaMka(a, b); \
    d = rotr64(d ^ a, (ulong)16); \
    c = fBlaMka(c, d); \
    b = rotr64(b ^ c, (ulong)63);

// offsets_round_N[id] lists the four positions inside a 128-ulong block that
// work-item `id` (0..31) loads as a, b, c, d in step GN below. Column 0 of
// round 1 equals column 0 of round 2, and likewise for rounds 3 and 4.
__constant char offsets_round_1[32][4] = {
    { 0, 4, 8, 12 },
    { 1, 5, 9, 13 },
    { 2, 6, 10, 14 },
    { 3, 7, 11, 15 },
    { 16, 20, 24, 28 },
    { 17, 21, 25, 29 },
    { 18, 22, 26, 30 },
    { 19, 23, 27, 31 },
    { 32, 36, 40, 44 },
    { 33, 37, 41, 45 },
    { 34, 38, 42, 46 },
    { 35, 39, 43, 47 },
    { 48, 52, 56, 60 },
    { 49, 53, 57, 61 },
    { 50, 54, 58, 62 },
    { 51, 55, 59, 63 },
    { 64, 68, 72, 76 },
    { 65, 69, 73, 77 },
    { 66, 70, 74, 78 },
    { 67, 71, 75, 79 },
    { 80, 84, 88, 92 },
    { 81, 85, 89, 93 },
    { 82, 86, 90, 94 },
    { 83, 87, 91, 95 },
    { 96, 100, 104, 108 },
    { 97, 101, 105, 109 },
    { 98, 102, 106, 110 },
    { 99, 103, 107, 111 },
    { 112, 116, 120, 124 },
    { 113, 117, 121, 125 },
    { 114, 118, 122, 126 },
    { 115, 119, 123, 127 },
};

__constant char offsets_round_2[32][4] = {
    { 0, 5, 10, 15 },
    { 1, 6, 11, 12 },
    { 2, 7, 8, 13 },
    { 3, 4, 9, 14 },
    { 16, 21, 26, 31 },
    { 17, 22, 27, 28 },
    { 18, 23, 24, 29 },
    { 19, 20, 25, 30 },
    { 32, 37, 42, 47 },
    { 33, 38, 43, 44 },
    { 34, 39, 40, 45 },
    { 35, 36, 41, 46 },
    { 48, 53, 58, 63 },
    { 49, 54, 59, 60 },
    { 50, 55, 56, 61 },
    { 51, 52, 57, 62 },
    { 64, 69, 74, 79 },
    { 65, 70, 75, 76 },
    { 66, 71, 72, 77 },
    { 67, 68, 73, 78 },
    { 80, 85, 90, 95 },
    { 81, 86, 91, 92 },
    { 82, 87, 88, 93 },
    { 83, 84, 89, 94 },
    { 96, 101, 106, 111 },
    { 97, 102, 107, 108 },
    { 98, 103, 104, 109 },
    { 99, 100, 105, 110 },
    { 112, 117, 122, 127 },
    { 113, 118, 123, 124 },
    { 114, 119, 120, 125 },
    { 115, 116, 121, 126 },
};

__constant char offsets_round_3[32][4] = {
    { 0, 32, 64, 96 },
    { 1, 33, 65, 97 },
    { 16, 48, 80, 112 },
    { 17, 49, 81, 113 },
    { 2, 34, 66, 98 },
    { 3, 35, 67, 99 },
    { 18, 50, 82, 114 },
    { 19, 51, 83, 115 },
    { 4, 36, 68, 100 },
    { 5, 37, 69, 101 },
    { 20, 52, 84, 116 },
    { 21, 53, 85, 117 },
    { 6, 38, 70, 102 },
    { 7, 39, 71, 103 },
    { 22, 54, 86, 118 },
    { 23, 55, 87, 119 },
    { 8, 40, 72, 104 },
    { 9, 41, 73, 105 },
    { 24, 56, 88, 120 },
    { 25, 57, 89, 121 },
    { 10, 42, 74, 106 },
    { 11, 43, 75, 107 },
    { 26, 58, 90, 122 },
    { 27, 59, 91, 123 },
    { 12, 44, 76, 108 },
    { 13, 45, 77, 109 },
    { 28, 60, 92, 124 },
    { 29, 61, 93, 125 },
    { 14, 46, 78, 110 },
    { 15, 47, 79, 111 },
    { 30, 62, 94, 126 },
    { 31, 63, 95, 127 },
};

__constant char offsets_round_4[32][4] = {
    { 0, 33, 80, 113 },
    { 1, 48, 81, 96 },
    { 16, 49, 64, 97 },
    { 17, 32, 65, 112 },
    { 2, 35, 82, 115 },
    { 3, 50, 83, 98 },
    { 18, 51, 66, 99 },
    { 19, 34, 67, 114 },
    { 4, 37, 84, 117 },
    { 5, 52, 85, 100 },
    { 20, 53, 68, 101 },
    { 21, 36, 69, 116 },
    { 6, 39, 86, 119 },
    { 7, 54, 87, 102 },
    { 22, 55, 70, 103 },
    { 23, 38, 71, 118 },
    { 8, 41, 88, 121 },
    { 9, 56, 89, 104 },
    { 24, 57, 72, 105 },
    { 25, 40, 73, 120 },
    { 10, 43, 90, 123 },
    { 11, 58, 91, 106 },
    { 26, 59, 74, 107 },
    { 27, 42, 75, 122 },
    { 12, 45, 92, 125 },
    { 13, 60, 93, 108 },
    { 28, 61, 76, 109 },
    { 29, 44, 77, 124 },
    { 14, 47, 94, 127 },
    { 15, 62, 95, 110 },
    { 30, 63, 78, 111 },
    { 31, 46, 79, 126 },
};

// G1..G4: the four permutation steps applied to a block held in local memory.
// They use the caller's a, b, c, d and the i<N>_<k> index variables.
// G1 and G3 load all four words but do not store `a`; G2 and G4 keep `a` from
// the previous step in its register, load b, c, d, and store all four.
#define G1(data) \
{ \
    barrier(CLK_LOCAL_MEM_FENCE); \
    a = data[i1_0]; \
    b = data[i1_1]; \
    c = data[i1_2]; \
    d = data[i1_3]; \
    COMPUTE \
    data[i1_1] = b; \
    data[i1_2] = c; \
    data[i1_3] = d; \
    barrier(CLK_LOCAL_MEM_FENCE); \
}

#define G2(data) \
{ \
    b = data[i2_1]; \
    c = data[i2_2]; \
    d = data[i2_3]; \
    COMPUTE \
    data[i2_0] = a; \
    data[i2_1] = b; \
    data[i2_2] = c; \
    data[i2_3] = d; \
    barrier(CLK_LOCAL_MEM_FENCE); \
}

#define G3(data) \
{ \
    a = data[i3_0]; \
    b = data[i3_1]; \
    c = data[i3_2]; \
    d = data[i3_3]; \
    COMPUTE \
    data[i3_1] = b; \
    data[i3_2] = c; \
    data[i3_3] = d; \
    barrier(CLK_LOCAL_MEM_FENCE); \
}

#define G4(data) \
{ \
    b = data[i4_1]; \
    c = data[i4_2]; \
    d = data[i4_3]; \
    COMPUTE \
    data[i4_0] = a; \
    data[i4_1] = b; \
    data[i4_2] = c; \
    data[i4_3] = d; \
    barrier(CLK_LOCAL_MEM_FENCE); \
}

// Memory-filling kernel. Each work-group processes two hashes
// (hash_base = group id * 2, hash_idx selects one of the pair) and each lane
// is served by 2 * THREADS_PER_LANE work-items.
//   chunk_0..chunk_5   base pointers of the memory chunks; the chunk and the
//                      offset inside it are derived from hash_base and
//                      threads_per_chunk
//   seed               two seed blocks per (hash, lane), copied into memory
//   out                per-hash output area (used by lane 0 at the end)
//   refs, idxs         per-segment reference / block index lists, used when a
//                      segment's type is 0; idxs may be 0
//   segments           three ints per (segment, lane): current index,
//                      previous index, segment type
//   memsize            per-hash memory size; divided by 8 to index ulongs
//   scratchpad         local memory, one BLOCK_SIZE_ULONG state per
//                      (lane, hash_idx)
// Block addresses are computed as index * 2 * BLOCK_SIZE_ULONG and accessed
// with vload4/vstore4 at wave_id (0..63), i.e. the blocks of the two hashes
// sit next to each other in memory.
__kernel void fill_blocks(__global ulong *chunk_0,
                          __global ulong *chunk_1,
                          __global ulong *chunk_2,
                          __global ulong *chunk_3,
                          __global ulong *chunk_4,
                          __global ulong *chunk_5,
                          __global ulong *seed,
                          __global ulong *out,
                          __global uint *refs,
                          __global uint *idxs,
                          __global uint *segments,
                          int memsize,
                          int lanes,
                          int seg_length,
                          int seg_count,
                          int threads_per_chunk,
                          __local ulong *scratchpad) { // lanes * BLOCK_SIZE_ULONG
    ulong4 tmp;
    ulong a, b, c, d;

    int hash_base = get_group_id(0) * 2;
    int local_id = get_local_id(0);

    int hash_idx = (local_id / THREADS_PER_LANE) % 2;
    int wave_id = local_id % (THREADS_PER_LANE * 2);
    int id = wave_id % THREADS_PER_LANE;
    int lane = local_id / (THREADS_PER_LANE * 2);
    int lane_length = seg_length * 4;

    int hash = hash_base + hash_idx;

    // Chunk pointers are stored as integers so the chunk can be chosen by index.
    ulong chunks[6];
    chunks[0] = (ulong)chunk_0;
    chunks[1] = (ulong)chunk_1;
    chunks[2] = (ulong)chunk_2;
    chunks[3] = (ulong)chunk_3;
    chunks[4] = (ulong)chunk_4;
    chunks[5] = (ulong)chunk_5;
    int chunk_index = hash_base / threads_per_chunk;
    int chunk_offset = hash_base - chunk_index * threads_per_chunk;
    __global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);

    // Per-work-item block positions for G1..G4.
    int i1_0 = offsets_round_1[id][0];
    int i1_1 = offsets_round_1[id][1];
    int i1_2 = offsets_round_1[id][2];
    int i1_3 = offsets_round_1[id][3];

    int i2_0 = offsets_round_2[id][0];
    int i2_1 = offsets_round_2[id][1];
    int i2_2 = offsets_round_2[id][2];
    int i2_3 = offsets_round_2[id][3];

    int i3_0 = offsets_round_3[id][0];
    int i3_1 = offsets_round_3[id][1];
    int i3_2 = offsets_round_3[id][2];
    int i3_3 = offsets_round_3[id][3];

    int i4_0 = offsets_round_4[id][0];
    int i4_1 = offsets_round_4[id][1];
    int i4_2 = offsets_round_4[id][2];
    int i4_3 = offsets_round_4[id][3];

    // Copy the two seed blocks of this (hash, lane) into the lane's first two
    // block slots.
    __global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
    __global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;

    vstore4(vload4(id, seed_mem), id, seed_dst);

    seed_mem += BLOCK_SIZE_ULONG;
    seed_dst += (2 * BLOCK_SIZE_ULONG);

    vstore4(vload4(id, seed_mem), id, seed_dst);

    __global ulong *next_block;
    __global ulong *prev_block;
    __global uint *seg_refs;
    __global uint *seg_idxs;

    __local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG;

    segments += (lane * 3);

    for(int s=0; s < (seg_count / lanes); s++) {
        int idx = ((s == 0) ? 2 : 0); // index for first slice in each lane is 2
        int with_xor = ((s >= 4) ? 1 : 0);
        int keep = 1;
        int slice = s % 4;
        int pass = s / 4;

        // NOTE(review): segments is declared __global uint * but is read here
        // through a __global int * - confirm the signedness mix is intended.
        __global int *cur_seg = &segments[s * lanes * 3];

        int cur_idx = cur_seg[0];
        int prev_idx = cur_seg[1];
        int seg_type = cur_seg[2];
        int ref_idx = 0;
        ulong4 ref = 0, next = 0;

        prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;

        tmp = vload4(wave_id, prev_block);

        if(seg_type == 0) {
            // Reference indices come from the refs list; the next reference
            // block is loaded one iteration ahead (nextref).
            seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
            ref_idx = seg_refs[0];

            if(idxs != 0) {
                seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
                cur_idx = seg_idxs[0];
            }

            ulong4 nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);

            for (int i=0;idx < seg_length;i++, idx++) {
                // The top bit of cur_idx is a flag, masked off for addressing.
                next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;

                if(with_xor == 1)
                    next = vload4(wave_id, next_block);

                ref = nextref;

                if (idx < seg_length - 1) {
                    ref_idx = seg_refs[i + 1];

                    if(idxs != 0) {
                        // NOTE(review): keep is a signed int, so this value is
                        // either 0 or has only the sign bit set; the
                        // `keep > 0` test below then looks like it can only
                        // pass with the initial value 1 - confirm the intent.
                        keep = cur_idx & 0x80000000;
                        cur_idx = seg_idxs[i + 1];
                    }
                    else
                        cur_idx++;

                    nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
                }

                tmp ^= ref;

                vstore4(tmp, id, state);

                G1(state);
                G2(state);
                G3(state);
                G4(state);

                if(with_xor == 1)
                    tmp ^= next;

                tmp ^= vload4(id, state);

                if(keep > 0) {
                    vstore4(tmp, wave_id, next_block);
                    barrier(CLK_GLOBAL_MEM_FENCE);
                }
            }
        }
        else {
            // Reference index is derived from the first word of the current
            // state, which is therefore kept up to date in local memory.
            vstore4(tmp, id, state);
            barrier(CLK_LOCAL_MEM_FENCE);

            for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
                ulong pseudo_rand = state[0];

                if(lanes == 1) {
                    uint reference_area_size = 0;
                    if(pass > 0) {
                        reference_area_size = lane_length - seg_length + idx - 1;
                    } else {
                        reference_area_size = slice * seg_length + idx - 1; // seg_length
                    }
                    ulong relative_position = pseudo_rand & 0xFFFFFFFF;
                    relative_position = (relative_position * relative_position) >> 32;

                    relative_position = reference_area_size - 1 -
                                        ((reference_area_size * relative_position) >> 32);

                    ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
                }
                else {
                    ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
                    uint reference_area_size = 0;
                    if(pass > 0) {
                        if (lane == ref_lane) {
                            reference_area_size = lane_length - seg_length + idx - 1;
                        } else {
                            reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
                        }
                    } else {
                        if (lane == ref_lane) {
                            reference_area_size = slice * seg_length + idx - 1; // seg_length
                        } else {
                            reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
                        }
                    }
                    ulong relative_position = pseudo_rand & 0xFFFFFFFF;
                    relative_position = (relative_position * relative_position) >> 32;

                    relative_position = reference_area_size - 1 -
                                        ((reference_area_size * relative_position) >> 32);

                    ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
                }

                ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);

                next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;

                if(with_xor == 1)
                    next = vload4(wave_id, next_block);

                tmp ^= ref;

                vstore4(tmp, id, state);

                G1(state);
                G2(state);
                G3(state);
                G4(state);

                if(with_xor == 1)
                    tmp ^= next;

                tmp ^= vload4(id, state);

                vstore4(tmp, id, state);
                vstore4(tmp, wave_id, next_block);
                barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
            }
        }
    }

    vstore4(tmp, id, state);
    barrier(CLK_LOCAL_MEM_FENCE);

    if(lane == 0) { // first lane needs to accumulate results
        __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
        // NOTE(review): source text appears to be missing inside the next
        // statement. The loop header below is the start of one `for` fused
        // with the tail of a different one; the rest of fill_blocks and the
        // beginning of the following kernel (which declares cursor_in,
        // cursor_out, thr_id, pwdlen, preseed, local_preseed, h, buf, shfl,
        // value, local_mem, local_seed, ...) are not present. Restore this
        // region from the original file; it is reproduced here unchanged.
        for(int l=1; l> 2); i++, cursor_in += 4, cursor_out += 4) {
            cursor_out[thr_id] = cursor_in[thr_id];
        }

        if(thr_id == 0) {
            for (int i = 0; i < (pwdlen % 4); i++) {
                cursor_out[i] = cursor_in[i];
            }

            // The nonce is read from the top byte of preseed[9] and the low
            // 24 bits of preseed[10], offset by the hash index, and written
            // back into the same bit positions of the local copy.
            uint nonce = (preseed[9] >> 24) | (preseed[10] << 8);
            nonce += hash;
            local_preseed[9] = (preseed[9] & 0x00FFFFFF) | (nonce << 24);
            local_preseed[10] = (preseed[10] & 0xFF000000) | (nonce >> 8);
        }

        // Feed the parameters one uint at a time through *value, followed by
        // the password and salt words.
        int buf_len = blake2b_init(h, ARGON2_PREHASH_DIGEST_LENGTH_UINT, thr_id);
        *value = lanes; //lanes
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = 32; //outlen
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = memsz; //m_cost
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = passes; //t_cost
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = ARGON2_VERSION; //version
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = ARGON2_TYPE_VALUE; //type
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        *value = pwdlen * 4; //pw_len
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        buf_len = blake2b_update_local(local_preseed, pwdlen, h, buf, buf_len, shfl, thr_id);
        *value = saltlen * 4; //salt_len
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        buf_len = blake2b_update_local(local_preseed, saltlen, h, buf, buf_len, shfl, thr_id);
        *value = 0; //secret_len
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        buf_len = blake2b_update_local(0, 0, h, buf, buf_len, shfl, thr_id);
        *value = 0; //ad_len
        buf_len = blake2b_update_local(value, 1, h, buf, buf_len, shfl, thr_id);
        buf_len = blake2b_update_local(0, 0, h, buf, buf_len, shfl, thr_id);

        blake2b_final_local(local_mem, ARGON2_PREHASH_DIGEST_LENGTH_UINT, h, buf, buf_len, shfl, thr_id);

        // Two extra words (idx, lane) follow the digest before it is expanded
        // into a full seed block.
        if (thr_id == 0) {
            local_mem[ARGON2_PREHASH_DIGEST_LENGTH_UINT] = idx;
            local_mem[ARGON2_PREHASH_DIGEST_LENGTH_UINT + 1] = lane;
        }

        blake2b_digestLong_local(local_seed, ARGON2_DWORDS_IN_BLOCK, local_mem, ARGON2_PREHASH_SEED_LENGTH_UINT, thr_id, (__local ulong *)&local_mem[20]);
    }
}

// Final hashing kernel: one work-group per hash. Digests the hash's
// BLOCK_SIZE_UINT-word block from `out` into ARGON2_RAW_LENGTH uints at
// hash + hash_id * (ARGON2_RAW_LENGTH + 1); work-item 0 then stores the nonce
// (base nonce taken from preseed[9]/preseed[10], plus hash_id) in the extra
// trailing slot.
__kernel void posthash (
        __global uint *hash,
        __global uint *out,
        __global uint *preseed,
        __local ulong *blake_shared) {
    int hash_id = get_group_id(0);
    int thread = get_local_id(0);

    __global uint *local_hash = hash + hash_id * (ARGON2_RAW_LENGTH + 1);
    __global uint *local_out = out + hash_id * BLOCK_SIZE_UINT;

    blake2b_digestLong_global(local_hash, ARGON2_RAW_LENGTH, local_out, ARGON2_DWORDS_IN_BLOCK, thread, blake_shared);

    if(thread == 0) {
        uint nonce = (preseed[9] >> 24) | (preseed[10] << 8);
        nonce += hash_id;
        local_hash[ARGON2_RAW_LENGTH] = nonce;
    }
}
)OCL";