From bfb397f217682dd704c5d81bae748780d54ea78c Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sat, 7 Sep 2019 22:39:57 +0300 Subject: [PATCH] Optimizations to OpenCL kernel - added optional support for amd_bitalign, added special case for lanes = 1 in index calculation for argon2d. --- .../hash/gpu/cuda/cuda_kernel.cu | 58 ++++++++----- .../hash/gpu/opencl/OpenCLHasher.cpp | 23 +++++- .../hash/gpu/opencl/OpenCLHasher.h | 1 + .../hash/gpu/opencl/OpenCLKernel.cpp | 82 ++++++++++++------- 4 files changed, 113 insertions(+), 51 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu index b9bad2b0..88e71b21 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu @@ -589,29 +589,47 @@ __global__ void fill_blocks(uint32_t *scratchpad0, } uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0); - uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0); - uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost - uint32_t reference_area_size = 0; - if(pass > 0) { - if (lane == ref_lane) { - reference_area_size = lane_length - seg_length + idx - 1; - } else { - reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); - } - } + if(lanes > 1) { + uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0); + + uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost + uint32_t reference_area_size = 0; + if (pass > 0) { + if (lane == ref_lane) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); + } + } else { + if (lane == ref_lane) { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } else { + reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); + } + } + asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); + + uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; + + ref_idx = ref_lane * lane_length + + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % + lane_length; + } else { - if (lane == ref_lane) { - reference_area_size = slice * seg_length + idx - 1; // seg_length - } else { - reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); - } + uint32_t reference_area_size = 0; + if (pass > 0) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } + asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); + + uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; + + ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % + lane_length; } - asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); - - uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; - - ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; ref_block = memory + ref_idx * BLOCK_SIZE_UINT4; diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp index e1ce9400..af682fbe 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp @@ -196,6 +196,24 @@ OpenCLDeviceInfo *OpenCLHasher::getDeviceInfo(cl_platform_id platform, cl_device device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/; + string extensions; + sz = 0; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &sz); + buffer = (char *)malloc(sz + 1); + device_info->error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sz, buffer, &sz); + if(device_info->error != CL_SUCCESS) { + free(buffer); + device_info->errorMessage = "Error querying device extensions."; + return device_info; + } + else { + buffer[sz] = 0; + extensions = buffer; + free(buffer); + } + + device_info->deviceExtensions = extensions; + device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL); if(device_info->error != CL_SUCCESS) { device_info->errorMessage = "Error querying device global memory size."; @@ -362,7 +380,10 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { return false; } - error = clBuildProgram(device->program, 1, &device->device, "", NULL, NULL); + string options = ""; + if(device->deviceExtensions.find("cl_amd_media_ops") != string::npos) + options += "-D USE_AMD_BITALIGN"; + error = clBuildProgram(device->program, 1, &device->device, options.c_str(), NULL, NULL); if (error != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h index 7b4472fd..3b688272 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h @@ -64,6 +64,7 @@ struct OpenCLDeviceInfo { Argon2ProfileInfo profileInfo; string deviceString; + string deviceExtensions; uint64_t maxMemSize; uint64_t maxAllocableMemSize; diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index 0841d8cf..f360189f 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -26,6 +26,14 @@ string OpenCLKernel = R"OCL( #define BLOCK_BYTES 32 #define OUT_BYTES 16 +#ifdef USE_AMD_BITALIGN +#pragma OPENCL EXTENSION cl_amd_media_ops : enable + +#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n))) +#else +#define rotr64(x, n) rotate((x), 64UL - (n)) +#endif + #define G(m, r, i, a, b, c, d) \ { \ a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ @@ -90,11 +98,6 @@ string OpenCLKernel = R"OCL( v3 = shfl[t + 12]; \ } -ulong rotr64(ulong x, ulong n) -{ - return rotate(x, 64 - n); -} - __constant ulong blake2b_IV[8] = { 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, @@ -547,13 +550,13 @@ void blake2b_digestLong_local(__global uint *out, int out_len, #define COMPUTE \ a = fBlaMka(a, b); \ - d = rotate(d ^ a, (ulong)32); \ + d = rotr64(d ^ a, (ulong)32); \ c = fBlaMka(c, d); \ - b = rotate(b ^ c, (ulong)40); \ + b = rotr64(b ^ c, (ulong)24); \ a = fBlaMka(a, b); \ - d = rotate(d ^ a, (ulong)48); \ + d = rotr64(d ^ a, (ulong)16); \ c = fBlaMka(c, d); \ - b = rotate(b ^ c, (ulong)1); + b = rotr64(b ^ c, (ulong)63); __constant char offsets_round_1[32][4] = { { 0, 4, 8, 12 }, @@ -905,31 +908,50 @@ __kernel void fill_blocks(__global ulong *chunk_0, for (int i=0;idx < seg_length;i++, idx++, cur_idx++) { ulong pseudo_rand = state[0]; - ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost - uint reference_area_size = 0; + if(lanes == 1) { + uint reference_area_size = 0; - if(pass > 0) { - if (lane == ref_lane) { - reference_area_size = lane_length - seg_length + idx - 1; - } else { - reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); - } - } - else { - if (lane == ref_lane) { - reference_area_size = slice * seg_length + idx - 1; // seg_length - } else { - reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); - } - } + if(pass > 0) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } - ulong relative_position = pseudo_rand & 0xFFFFFFFF; - relative_position = (relative_position * relative_position) >> 32; + ulong relative_position = pseudo_rand & 0xFFFFFFFF; + relative_position = (relative_position * relative_position) >> 32; - relative_position = reference_area_size - 1 - - ((reference_area_size * relative_position) >> 32); + relative_position = reference_area_size - 1 - + ((reference_area_size * relative_position) >> 32); - ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + } + else { + ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost + uint reference_area_size = 0; + + if(pass > 0) { + if (lane == ref_lane) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); + } + } + else { + if (lane == ref_lane) { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } else { + reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); + } + } + + ulong relative_position = pseudo_rand & 0xFFFFFFFF; + relative_position = (relative_position * relative_position) >> 32; + + relative_position = reference_area_size - 1 - + ((reference_area_size * relative_position) >> 32); + + ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + } ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);