From bfb397f217682dd704c5d81bae748780d54ea78c Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sat, 7 Sep 2019 22:39:57 +0300 Subject: [PATCH 1/8] Optimizations to OpenCL kernel - added optional support for amd_bitalign, added special case for lanes = 1 in index calculation for argon2d. --- .../hash/gpu/cuda/cuda_kernel.cu | 58 ++++++++----- .../hash/gpu/opencl/OpenCLHasher.cpp | 23 +++++- .../hash/gpu/opencl/OpenCLHasher.h | 1 + .../hash/gpu/opencl/OpenCLKernel.cpp | 82 ++++++++++++------- 4 files changed, 113 insertions(+), 51 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu index b9bad2b0..88e71b21 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu @@ -589,29 +589,47 @@ __global__ void fill_blocks(uint32_t *scratchpad0, } uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0); - uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0); - uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost - uint32_t reference_area_size = 0; - if(pass > 0) { - if (lane == ref_lane) { - reference_area_size = lane_length - seg_length + idx - 1; - } else { - reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); - } - } + if(lanes > 1) { + uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0); + + uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost + uint32_t reference_area_size = 0; + if (pass > 0) { + if (lane == ref_lane) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); + } + } else { + if (lane == ref_lane) { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } else { + reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); + } + } + asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); + + uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; + + ref_idx = ref_lane * lane_length + + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % + lane_length; + } else { - if (lane == ref_lane) { - reference_area_size = slice * seg_length + idx - 1; // seg_length - } else { - reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); - } + uint32_t reference_area_size = 0; + if (pass > 0) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } + asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); + + uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; + + ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % + lane_length; } - asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size)); - - uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo; - - ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; ref_block = memory + ref_idx * BLOCK_SIZE_UINT4; diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp index e1ce9400..af682fbe 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp @@ -196,6 +196,24 @@ OpenCLDeviceInfo *OpenCLHasher::getDeviceInfo(cl_platform_id platform, cl_device device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/; + string extensions; + sz = 0; + clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &sz); + buffer = (char *)malloc(sz + 1); + device_info->error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sz, buffer, &sz); + if(device_info->error != CL_SUCCESS) { + free(buffer); + device_info->errorMessage = "Error querying device extensions."; + return device_info; + } + else { + buffer[sz] = 0; + extensions = buffer; + free(buffer); + } + + device_info->deviceExtensions = extensions; + device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL); if(device_info->error != CL_SUCCESS) { device_info->errorMessage = "Error querying device global memory size."; @@ -362,7 +380,10 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { return false; } - error = clBuildProgram(device->program, 1, &device->device, "", NULL, NULL); + string options = ""; + if(device->deviceExtensions.find("cl_amd_media_ops") != string::npos) + options += "-D USE_AMD_BITALIGN"; + error = clBuildProgram(device->program, 1, &device->device, options.c_str(), NULL, NULL); if (error != CL_SUCCESS) { size_t log_size; clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h index 7b4472fd..3b688272 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h @@ -64,6 +64,7 @@ struct OpenCLDeviceInfo { Argon2ProfileInfo profileInfo; string deviceString; + string deviceExtensions; uint64_t maxMemSize; uint64_t maxAllocableMemSize; diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index 0841d8cf..f360189f 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -26,6 +26,14 @@ string OpenCLKernel = R"OCL( #define BLOCK_BYTES 32 #define OUT_BYTES 16 +#ifdef USE_AMD_BITALIGN +#pragma OPENCL EXTENSION cl_amd_media_ops : enable + +#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n))) +#else +#define rotr64(x, n) rotate((x), 64UL - (n)) +#endif + #define G(m, r, i, a, b, c, d) \ { \ a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \ @@ -90,11 +98,6 @@ string OpenCLKernel = R"OCL( v3 = shfl[t + 12]; \ } -ulong rotr64(ulong x, ulong n) -{ - return rotate(x, 64 - n); -} - __constant ulong blake2b_IV[8] = { 0x6A09E667F3BCC908, 0xBB67AE8584CAA73B, 0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1, @@ -547,13 +550,13 @@ void blake2b_digestLong_local(__global uint *out, int out_len, #define COMPUTE \ a = fBlaMka(a, b); \ - d = rotate(d ^ a, (ulong)32); \ + d = rotr64(d ^ a, (ulong)32); \ c = fBlaMka(c, d); \ - b = rotate(b ^ c, (ulong)40); \ + b = rotr64(b ^ c, (ulong)24); \ a = fBlaMka(a, b); \ - d = rotate(d ^ a, (ulong)48); \ + d = rotr64(d ^ a, (ulong)16); \ c = fBlaMka(c, d); \ - b = rotate(b ^ c, (ulong)1); + b = rotr64(b ^ c, (ulong)63); __constant char offsets_round_1[32][4] = { { 0, 4, 8, 12 }, @@ -905,31 +908,50 @@ __kernel void fill_blocks(__global ulong *chunk_0, for (int i=0;idx < seg_length;i++, idx++, cur_idx++) { ulong pseudo_rand = state[0]; - ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost - uint reference_area_size = 0; + if(lanes == 1) { + uint reference_area_size = 0; - if(pass > 0) { - if (lane == ref_lane) { - reference_area_size = lane_length - seg_length + idx - 1; - } else { - reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); - } - } - else { - if (lane == ref_lane) { - reference_area_size = slice * seg_length + idx - 1; // seg_length - } else { - reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); - } - } + if(pass > 0) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } - ulong relative_position = pseudo_rand & 0xFFFFFFFF; - relative_position = (relative_position * relative_position) >> 32; + ulong relative_position = pseudo_rand & 0xFFFFFFFF; + relative_position = (relative_position * relative_position) >> 32; - relative_position = reference_area_size - 1 - - ((reference_area_size * relative_position) >> 32); + relative_position = reference_area_size - 1 - + ((reference_area_size * relative_position) >> 32); - ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + } + else { + ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost + uint reference_area_size = 0; + + if(pass > 0) { + if (lane == ref_lane) { + reference_area_size = lane_length - seg_length + idx - 1; + } else { + reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0); + } + } + else { + if (lane == ref_lane) { + reference_area_size = slice * seg_length + idx - 1; // seg_length + } else { + reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0); + } + } + + ulong relative_position = pseudo_rand & 0xFFFFFFFF; + relative_position = (relative_position * relative_position) >> 32; + + relative_position = reference_area_size - 1 - + ((reference_area_size * relative_position) >> 32); + + ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; + } ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG); From 4e0d75b611b8176778e4322365c1a6da961d8906 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sat, 7 Sep 2019 23:28:37 +0300 Subject: [PATCH 2/8] Optimizations to OpenCL kernel - resize workgroup to 64 threads to fully use AMD wavefront size. --- .../hash/gpu/opencl/OpenCLHasher.cpp | 4 +- .../hash/gpu/opencl/OpenCLKernel.cpp | 47 ++++++++++--------- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp index af682fbe..d31bf69b 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp @@ -760,9 +760,9 @@ void *opencl_kernel_filler(int threads, Argon2Profile *profile, void *user_data) cl_int error; size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost; - size_t local_work_items = KERNEL_WORKGROUP_SIZE * profile->thrCost; + size_t local_work_items = 2 * KERNEL_WORKGROUP_SIZE * profile->thrCost; - size_t shared_mem = profile->thrCost * ARGON2_QWORDS_IN_BLOCK; + size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK; clSetKernelArg(device->kernelFillBlocks, 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]); clSetKernelArg(device->kernelFillBlocks, 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]); diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index f360189f..fad3b886 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -771,13 +771,17 @@ __kernel void fill_blocks(__global ulong *chunk_0, ulong4 tmp; ulong a, b, c, d; - int hash = get_group_id(0); + int hash_base = get_group_id(0) * 2; int local_id = get_local_id(0); - int id = local_id % THREADS_PER_LANE; - int lane = local_id / THREADS_PER_LANE; + int hash_idx = (local_id / THREADS_PER_LANE) % 2; + int wave_id = local_id % (THREADS_PER_LANE * 2); + int id = wave_id % THREADS_PER_LANE; + int lane = local_id / (THREADS_PER_LANE * 2); int lane_length = seg_length * 4; + int hash = hash_base + hash_idx; + ulong chunks[6]; chunks[0] = (ulong)chunk_0; chunks[1] = (ulong)chunk_1; @@ -785,8 +789,8 @@ __kernel void fill_blocks(__global ulong *chunk_0, chunks[3] = (ulong)chunk_3; chunks[4] = (ulong)chunk_4; chunks[5] = (ulong)chunk_5; - int chunk_index = hash / threads_per_chunk; - int chunk_offset = hash - chunk_index * threads_per_chunk; + int chunk_index = hash_base / threads_per_chunk; + int chunk_offset = hash_base - chunk_index * threads_per_chunk; __global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8); int i1_0 = offsets_round_1[id][0]; @@ -809,15 +813,13 @@ __kernel void fill_blocks(__global ulong *chunk_0, int i4_2 = offsets_round_4[id][2]; int i4_3 = offsets_round_4[id][3]; - __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG; __global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG; - - __global ulong *seed_dst = memory + lane * lane_length * BLOCK_SIZE_ULONG; + __global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG; vstore4(vload4(id, seed_mem), id, seed_dst); seed_mem += BLOCK_SIZE_ULONG; - seed_dst += BLOCK_SIZE_ULONG; + seed_dst += (2 * BLOCK_SIZE_ULONG); vstore4(vload4(id, seed_mem), id, seed_dst); @@ -826,7 +828,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, __global uint *seg_refs; __global uint *seg_idxs; - __local ulong *state = scratchpad + lane * BLOCK_SIZE_ULONG; + __local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG; segments += (lane * 3); @@ -845,9 +847,9 @@ __kernel void fill_blocks(__global ulong *chunk_0, int ref_idx = 0; ulong4 ref = 0, next = 0; - prev_block = memory + prev_idx * BLOCK_SIZE_ULONG; + prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG; - tmp = vload4(id, prev_block); + tmp = vload4(wave_id, prev_block); if(seg_type == 0) { seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); @@ -858,13 +860,13 @@ __kernel void fill_blocks(__global ulong *chunk_0, cur_idx = seg_idxs[0]; } - ulong4 nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG); + ulong4 nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); for (int i=0;idx < seg_length;i++, idx++) { - next_block = memory + (cur_idx & 0x7FFFFFFF) * BLOCK_SIZE_ULONG; + next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(id, next_block); + next = vload4(wave_id, next_block); ref = nextref; @@ -878,7 +880,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, else cur_idx++; - nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG); + nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); } tmp ^= ref; @@ -896,7 +898,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, tmp ^= vload4(id, state); if(keep > 0) { - vstore4(tmp, id, next_block); + vstore4(tmp, wave_id, next_block); barrier(CLK_GLOBAL_MEM_FENCE); } } @@ -953,12 +955,12 @@ __kernel void fill_blocks(__global ulong *chunk_0, ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; } - ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG); + ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - next_block = memory + cur_idx * BLOCK_SIZE_ULONG; + next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(id, next_block); + next = vload4(wave_id, next_block); tmp ^= ref; @@ -975,7 +977,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, tmp ^= vload4(id, state); vstore4(tmp, id, state); - vstore4(tmp, id, next_block); + vstore4(tmp, wave_id, next_block); barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE); } } @@ -985,8 +987,9 @@ __kernel void fill_blocks(__global ulong *chunk_0, barrier(CLK_LOCAL_MEM_FENCE); if(lane == 0) { // first lane needs to acumulate results + __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG; for(int l=1; l Date: Sun, 8 Sep 2019 00:13:51 +0300 Subject: [PATCH 3/8] Support for optional double threading in OpenCL and CUDA. --- CMakeLists.txt | 7 +- .../hash/gpu/opencl/OpenCLHasher.cpp | 201 ++++++++++++------ .../hash/gpu/opencl/OpenCLHasher.h | 21 +- .../hash/gpu/opencl/OpenCLKernel.cpp | 6 +- 4 files changed, 159 insertions(+), 76 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index acb62108..60651d62 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,8 @@ option(WITH_TLS "Enable OpenSSL support" ON) option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF) option(WITH_CUDA "Enable CUDA support" ON) option(WITH_OPENCL "Enable OpenCL support" ON) +option(WITH_OPENCL_DOUBLE_THREADS "Enable dual threads for OpenCL jobs" ON) +option(WITH_CUDA_DOUBLE_THREADS "Enable dual threads for CUDA jobs" ON) include (CheckIncludeFile) include (cmake/cpu.cmake) @@ -373,6 +375,9 @@ endif(ARCH STREQUAL "arm" OR ARCH STREQUAL "aarch64") if(WITH_OPENCL) add_definitions(-DWITH_OPENCL) find_package(OpenCL REQUIRED) + if(WITH_OPENCL_DOUBLE_THREADS) + add_definitions(-DPARALLEL_OPENCL) + endif() include_directories(${OpenCL_INCLUDE_DIR}) add_library(opencl_hasher MODULE ${SOURCE_OPENCL_HASHER}) set_target_properties(opencl_hasher @@ -388,7 +393,7 @@ endif() if(WITH_CUDA) add_definitions(-DWITH_CUDA) find_package(CUDA REQUIRED) - if(NOT WIN32) + if(WITH_CUDA_DOUBLE_THREADS) add_definitions(-DPARALLEL_CUDA) endif() set( diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp index d31bf69b..41d46c04 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp @@ -363,7 +363,14 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { return false; } - device->queue = clCreateCommandQueue(device->context, device->device, CL_QUEUE_PROFILING_ENABLE, &error); + device->queue[0] = clCreateCommandQueue(device->context, device->device, NULL, &error); + if (error != CL_SUCCESS) { + device->error = error; + device->errorMessage = "Error getting device command queue."; + return false; + } + + device->queue[1] = clCreateCommandQueue(device->context, device->device, NULL, &error); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error getting device command queue."; @@ -398,26 +405,44 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { return false; } - device->kernelPrehash = clCreateKernel(device->program, "prehash", &error); + device->kernelPrehash[0] = clCreateKernel(device->program, "prehash", &error); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error creating opencl prehash kernel for device."; return false; } - device->kernelFillBlocks = clCreateKernel(device->program, "fill_blocks", &error); + device->kernelPrehash[1] = clCreateKernel(device->program, "prehash", &error); + if (error != CL_SUCCESS) { + device->error = error; + device->errorMessage = "Error creating opencl prehash kernel for device."; + return false; + } + device->kernelFillBlocks[0] = clCreateKernel(device->program, "fill_blocks", &error); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error creating opencl main kernel for device."; return false; } - device->kernelPosthash = clCreateKernel(device->program, "posthash", &error); + device->kernelFillBlocks[1] = clCreateKernel(device->program, "fill_blocks", &error); + if (error != CL_SUCCESS) { + device->error = error; + device->errorMessage = "Error creating opencl main kernel for device."; + return false; + } + device->kernelPosthash[0] = clCreateKernel(device->program, "posthash", &error); + if (error != CL_SUCCESS) { + device->error = error; + device->errorMessage = "Error creating opencl posthash kernel for device."; + return false; + } + device->kernelPosthash[1] = clCreateKernel(device->program, "posthash", &error); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error creating opencl posthash kernel for device."; return false; } - device->profileInfo.threads_per_chunk = (uint32_t) (device->maxAllocableMemSize / device->profileInfo.profile->memSize); + device->profileInfo.threads_per_chunk = ((uint32_t) (device->maxAllocableMemSize / device->profileInfo.profile->memSize) / 2) * 2; // make it divisible by 2 to allow 2 hashes per wavefront size_t chunk_size = device->profileInfo.threads_per_chunk * device->profileInfo.profile->memSize; if (chunk_size == 0) { @@ -650,7 +675,7 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { refs[i] = device->profileInfo.profile->blockRefs[i * 3 + 1]; } - error=clEnqueueWriteBuffer(device->queue, device->arguments.refs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), refs, 0, NULL, NULL); + error=clEnqueueWriteBuffer(device->queue[0], device->arguments.refs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), refs, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error writing to gpu memory."; @@ -668,7 +693,7 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { } } - error=clEnqueueWriteBuffer(device->queue, device->arguments.idxs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), idxs, 0, NULL, NULL); + error=clEnqueueWriteBuffer(device->queue[0], device->arguments.idxs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), idxs, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error writing to gpu memory."; @@ -678,37 +703,60 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) { free(idxs); } - error=clEnqueueWriteBuffer(device->queue, device->arguments.segments, CL_TRUE, 0, device->profileInfo.profile->segCount * 3 * sizeof(uint32_t), device->profileInfo.profile->segments, 0, NULL, NULL); + error=clEnqueueWriteBuffer(device->queue[0], device->arguments.segments, CL_TRUE, 0, device->profileInfo.profile->segCount * 3 * sizeof(uint32_t), device->profileInfo.profile->segments, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error writing to gpu memory."; return false; } - clSetKernelArg(device->kernelFillBlocks, 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0); - clSetKernelArg(device->kernelFillBlocks, 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1); - clSetKernelArg(device->kernelFillBlocks, 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2); - clSetKernelArg(device->kernelFillBlocks, 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3); - clSetKernelArg(device->kernelFillBlocks, 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4); - clSetKernelArg(device->kernelFillBlocks, 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5); - clSetKernelArg(device->kernelFillBlocks, 8, sizeof(device->arguments.refs), &device->arguments.refs); - if(device->profileInfo.profile->succesiveIdxs == 0) - clSetKernelArg(device->kernelFillBlocks, 9, sizeof(device->arguments.idxs), &device->arguments.idxs); - else - clSetKernelArg(device->kernelFillBlocks, 9, sizeof(cl_mem), NULL); - clSetKernelArg(device->kernelFillBlocks, 10, sizeof(device->arguments.segments), &device->arguments.segments); - clSetKernelArg(device->kernelFillBlocks, 11, sizeof(int32_t), &device->profileInfo.profile->memSize); - clSetKernelArg(device->kernelFillBlocks, 12, sizeof(int32_t), &device->profileInfo.profile->thrCost); - clSetKernelArg(device->kernelFillBlocks, 13, sizeof(int32_t), &device->profileInfo.profile->segSize); - clSetKernelArg(device->kernelFillBlocks, 14, sizeof(int32_t), &device->profileInfo.profile->segCount); - clSetKernelArg(device->kernelFillBlocks, 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk); - - clSetKernelArg(device->kernelPrehash, 2, sizeof(int32_t), &device->profileInfo.profile->memCost); - clSetKernelArg(device->kernelPrehash, 3, sizeof(int32_t), &device->profileInfo.profile->thrCost); int passes = device->profileInfo.profile->segCount / (4 * device->profileInfo.profile->thrCost); - clSetKernelArg(device->kernelPrehash, 4, sizeof(int32_t), &passes); - clSetKernelArg(device->kernelPrehash, 6, sizeof(int32_t), &device->profileInfo.profile->saltLen); + clSetKernelArg(device->kernelFillBlocks[0], 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0); + clSetKernelArg(device->kernelFillBlocks[0], 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1); + clSetKernelArg(device->kernelFillBlocks[0], 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2); + clSetKernelArg(device->kernelFillBlocks[0], 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3); + clSetKernelArg(device->kernelFillBlocks[0], 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4); + clSetKernelArg(device->kernelFillBlocks[0], 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5); + clSetKernelArg(device->kernelFillBlocks[0], 8, sizeof(device->arguments.refs), &device->arguments.refs); + if(device->profileInfo.profile->succesiveIdxs == 0) + clSetKernelArg(device->kernelFillBlocks[0], 9, sizeof(device->arguments.idxs), &device->arguments.idxs); + else + clSetKernelArg(device->kernelFillBlocks[0], 9, sizeof(cl_mem), NULL); + clSetKernelArg(device->kernelFillBlocks[0], 10, sizeof(device->arguments.segments), &device->arguments.segments); + clSetKernelArg(device->kernelFillBlocks[0], 11, sizeof(int32_t), &device->profileInfo.profile->memSize); + clSetKernelArg(device->kernelFillBlocks[0], 12, sizeof(int32_t), &device->profileInfo.profile->thrCost); + clSetKernelArg(device->kernelFillBlocks[0], 13, sizeof(int32_t), &device->profileInfo.profile->segSize); + clSetKernelArg(device->kernelFillBlocks[0], 14, sizeof(int32_t), &device->profileInfo.profile->segCount); + clSetKernelArg(device->kernelFillBlocks[0], 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk); + + clSetKernelArg(device->kernelPrehash[0], 2, sizeof(int32_t), &device->profileInfo.profile->memCost); + clSetKernelArg(device->kernelPrehash[0], 3, sizeof(int32_t), &device->profileInfo.profile->thrCost); + clSetKernelArg(device->kernelPrehash[0], 4, sizeof(int32_t), &passes); + clSetKernelArg(device->kernelPrehash[0], 6, sizeof(int32_t), &device->profileInfo.profile->saltLen); + + clSetKernelArg(device->kernelFillBlocks[1], 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0); + clSetKernelArg(device->kernelFillBlocks[1], 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1); + clSetKernelArg(device->kernelFillBlocks[1], 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2); + clSetKernelArg(device->kernelFillBlocks[1], 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3); + clSetKernelArg(device->kernelFillBlocks[1], 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4); + clSetKernelArg(device->kernelFillBlocks[1], 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5); + clSetKernelArg(device->kernelFillBlocks[1], 8, sizeof(device->arguments.refs), &device->arguments.refs); + if(device->profileInfo.profile->succesiveIdxs == 0) + clSetKernelArg(device->kernelFillBlocks[1], 9, sizeof(device->arguments.idxs), &device->arguments.idxs); + else + clSetKernelArg(device->kernelFillBlocks[1], 9, sizeof(cl_mem), NULL); + clSetKernelArg(device->kernelFillBlocks[1], 10, sizeof(device->arguments.segments), &device->arguments.segments); + clSetKernelArg(device->kernelFillBlocks[1], 11, sizeof(int32_t), &device->profileInfo.profile->memSize); + clSetKernelArg(device->kernelFillBlocks[1], 12, sizeof(int32_t), &device->profileInfo.profile->thrCost); + clSetKernelArg(device->kernelFillBlocks[1], 13, sizeof(int32_t), &device->profileInfo.profile->segSize); + clSetKernelArg(device->kernelFillBlocks[1], 14, sizeof(int32_t), &device->profileInfo.profile->segCount); + clSetKernelArg(device->kernelFillBlocks[1], 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk); + + clSetKernelArg(device->kernelPrehash[1], 2, sizeof(int32_t), &device->profileInfo.profile->memCost); + clSetKernelArg(device->kernelPrehash[1], 3, sizeof(int32_t), &device->profileInfo.profile->thrCost); + clSetKernelArg(device->kernelPrehash[1], 4, sizeof(int32_t), &passes); + clSetKernelArg(device->kernelPrehash[1], 6, sizeof(int32_t), &device->profileInfo.profile->saltLen); return true; } @@ -724,29 +772,29 @@ bool opencl_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, size_t total_work_items = sessions * 4 * ceil(threads / hashes_per_block); size_t local_work_items = sessions * 4; - device->deviceLock.lock(); + gpumgmt_thread->lock(); - error = clEnqueueWriteBuffer(device->queue, device->arguments.preseedMemory[gpumgmt_thread->threadId], + error = clEnqueueWriteBuffer(device->queue[gpumgmt_thread->threadId], device->arguments.preseedMemory[gpumgmt_thread->threadId], CL_FALSE, 0, gpumgmt_thread->hashData.inSize, memory, 0, NULL, NULL); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error writing to gpu memory."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return false; } int inSizeInInt = gpumgmt_thread->hashData.inSize / 4; - clSetKernelArg(device->kernelPrehash, 0, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelPrehash, 1, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelPrehash, 5, sizeof(int), &inSizeInInt); - clSetKernelArg(device->kernelPrehash, 7, sizeof(int), &threads); - clSetKernelArg(device->kernelPrehash, 8, sessions * sizeof(cl_ulong) * 76, NULL); // (preseed size is 16 ulongs = 128 bytes) + clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 0, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 1, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 5, sizeof(int), &inSizeInInt); + clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 7, sizeof(int), &threads); + clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 8, sessions * sizeof(cl_ulong) * 76, NULL); // (preseed size is 16 ulongs = 128 bytes) - error=clEnqueueNDRangeKernel(device->queue, device->kernelPrehash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); + error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelPrehash[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error running the kernel."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return false; } @@ -764,15 +812,16 @@ void *opencl_kernel_filler(int threads, Argon2Profile *profile, void *user_data) size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK; - clSetKernelArg(device->kernelFillBlocks, 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelFillBlocks, 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelFillBlocks, 16, sizeof(cl_ulong) * shared_mem, NULL); + clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 16, sizeof(int), &(gpumgmt_thread->threadsIdx)); + clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 17, sizeof(cl_ulong) * shared_mem, NULL); - error=clEnqueueNDRangeKernel(device->queue, device->kernelFillBlocks, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); + error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelFillBlocks[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error running the kernel."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return NULL; } @@ -788,36 +837,36 @@ bool opencl_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, size_t total_work_items = threads * 4; size_t local_work_items = 4; - clSetKernelArg(device->kernelPosthash, 0, sizeof(device->arguments.hashMemory[gpumgmt_thread->threadId]), &device->arguments.hashMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelPosthash, 1, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelPosthash, 2, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]); - clSetKernelArg(device->kernelPosthash, 3, sizeof(cl_ulong) * 60, NULL); + clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 0, sizeof(device->arguments.hashMemory[gpumgmt_thread->threadId]), &device->arguments.hashMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 1, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 2, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]); + clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 3, sizeof(cl_ulong) * 60, NULL); - error=clEnqueueNDRangeKernel(device->queue, device->kernelPosthash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); + error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelPosthash[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error running the kernel."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return false; } - error = clEnqueueReadBuffer(device->queue, device->arguments.hashMemory[gpumgmt_thread->threadId], CL_FALSE, 0, threads * (xmrig::ARGON2_HASHLEN + 4), memory, 0, NULL, NULL); + error = clEnqueueReadBuffer(device->queue[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], CL_FALSE, 0, threads * (xmrig::ARGON2_HASHLEN + 4), memory, 0, NULL, NULL); if (error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error reading gpu memory."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return false; } - error=clFinish(device->queue); + error=clFinish(device->queue[gpumgmt_thread->threadId]); if(error != CL_SUCCESS) { device->error = error; device->errorMessage = "Error flushing GPU queue."; - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return false; } - device->deviceLock.unlock(); + gpumgmt_thread->unlock(); return true; } @@ -831,9 +880,24 @@ void OpenCLHasher::buildThreadData() { OpenCLGpuMgmtThreadData &thread_data = m_threadData[i * 2 + threadId]; thread_data.device = device; thread_data.threadId = threadId; + +#ifdef PARALLEL_OPENCL + if(threadId == 0) { + thread_data.threadsIdx = 0; + thread_data.threads = device->profileInfo.threads / 2; + } + else { + thread_data.threadsIdx = device->profileInfo.threads / 2; + thread_data.threads = device->profileInfo.threads - thread_data.threadsIdx; + } +#else + thread_data.threadsIdx = 0; + thread_data.threads = device->profileInfo.threads; +#endif + thread_data.argon2 = new Argon2(opencl_kernel_prehasher, opencl_kernel_filler, opencl_kernel_posthasher, nullptr, &thread_data); - thread_data.argon2->setThreads(device->profileInfo.threads); + thread_data.argon2->setThreads(thread_data.threads); thread_data.hashData.outSize = xmrig::ARGON2_HASHLEN + 4; } } @@ -851,7 +915,7 @@ int OpenCLHasher::compute(int threadIdx, uint8_t *input, size_t size, uint8_t *o } uint32_t *nonce = ((uint32_t *)(((uint8_t*)threadData.hashData.input) + 39)); - (*nonce) += threadData.device->profileInfo.threads; + (*nonce) += threadData.threads; return hashCount; } @@ -878,11 +942,15 @@ void OpenCLHasher::cleanup() { clReleaseMemObject((*it)->arguments.hashMemory[0]); clReleaseMemObject((*it)->arguments.hashMemory[1]); - clReleaseKernel((*it)->kernelPrehash); - clReleaseKernel((*it)->kernelFillBlocks); - clReleaseKernel((*it)->kernelPosthash); + clReleaseKernel((*it)->kernelPrehash[0]); + clReleaseKernel((*it)->kernelPrehash[1]); + clReleaseKernel((*it)->kernelFillBlocks[0]); + clReleaseKernel((*it)->kernelFillBlocks[1]); + clReleaseKernel((*it)->kernelPosthash[0]); + clReleaseKernel((*it)->kernelPosthash[1]); clReleaseProgram((*it)->program); - clReleaseCommandQueue((*it)->queue); + clReleaseCommandQueue((*it)->queue[0]); + clReleaseCommandQueue((*it)->queue[1]); clReleaseContext((*it)->context); } clReleaseDevice((*it)->device); @@ -892,13 +960,8 @@ void OpenCLHasher::cleanup() { } size_t OpenCLHasher::parallelism(int workerIdx) { - // there are 2 computing threads per device, so divide by 2 to get device index - workerIdx /= 2; - - if(workerIdx < 0 || workerIdx > m_enabledDevices.size()) - return 0; - - return m_enabledDevices[workerIdx]->profileInfo.threads; + OpenCLGpuMgmtThreadData &threadData = m_threadData[workerIdx]; + return threadData.threads; } size_t OpenCLHasher::deviceCount() { diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h index 3b688272..b6929047 100755 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.h @@ -51,12 +51,12 @@ struct OpenCLDeviceInfo { cl_platform_id platform; cl_device_id device; cl_context context; - cl_command_queue queue; + cl_command_queue queue[2]; cl_program program; - cl_kernel kernelPrehash; - cl_kernel kernelFillBlocks; - cl_kernel kernelPosthash; + cl_kernel kernelPrehash[2]; + cl_kernel kernelFillBlocks[2]; + cl_kernel kernelPosthash[2]; int deviceIndex; @@ -75,10 +75,23 @@ struct OpenCLDeviceInfo { }; struct OpenCLGpuMgmtThreadData { + void lock() { +#ifndef PARALLEL_OPENCL + device->deviceLock.lock(); +#endif + } + + void unlock() { +#ifndef PARALLEL_OPENCL + device->deviceLock.unlock(); +#endif + } int threadId; OpenCLDeviceInfo *device; Argon2 *argon2; HashData hashData; + int threads; + int threadsIdx; }; class OpenCLHasher : public Hasher { diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index fad3b886..0f14e1cf 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -767,11 +767,13 @@ __kernel void fill_blocks(__global ulong *chunk_0, int seg_length, int seg_count, int threads_per_chunk, + int thread_idx, __local ulong *scratchpad) { // lanes * BLOCK_SIZE_ULONG ulong4 tmp; ulong a, b, c, d; int hash_base = get_group_id(0) * 2; + int mem_hash = hash_base + thread_idx; int local_id = get_local_id(0); int hash_idx = (local_id / THREADS_PER_LANE) % 2; @@ -789,8 +791,8 @@ __kernel void fill_blocks(__global ulong *chunk_0, chunks[3] = (ulong)chunk_3; chunks[4] = (ulong)chunk_4; chunks[5] = (ulong)chunk_5; - int chunk_index = hash_base / threads_per_chunk; - int chunk_offset = hash_base - chunk_index * threads_per_chunk; + int chunk_index = mem_hash / threads_per_chunk; + int chunk_offset = mem_hash - chunk_index * threads_per_chunk; __global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8); int i1_0 = offsets_round_1[id][0]; From f787b9f2cc8e74127761207f4f258bb0cd864277 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sun, 8 Sep 2019 01:11:17 +0300 Subject: [PATCH 4/8] Disable OPENCL double threading as default option as it seems gives lower hashrate. --- CMakeLists.txt | 2 +- src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 60651d62..0ff926ab 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 @@ option(WITH_TLS "Enable OpenSSL support" ON) option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF) option(WITH_CUDA "Enable CUDA support" ON) option(WITH_OPENCL "Enable OpenCL support" ON) -option(WITH_OPENCL_DOUBLE_THREADS "Enable dual threads for OpenCL jobs" ON) +option(WITH_OPENCL_DOUBLE_THREADS "Enable dual threads for OpenCL jobs" OFF) option(WITH_CUDA_DOUBLE_THREADS "Enable dual threads for CUDA jobs" ON) include (CheckIncludeFile) diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index 0f14e1cf..cbd2d11d 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -862,7 +862,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, cur_idx = seg_idxs[0]; } - ulong4 nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); + ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); for (int i=0;idx < seg_length;i++, idx++) { next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG; @@ -870,7 +870,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, if(with_xor == 1) next = vload4(wave_id, next_block); - ref = nextref; + tmp ^= ref; if (idx < seg_length - 1) { ref_idx = seg_refs[i + 1]; @@ -882,11 +882,9 @@ __kernel void fill_blocks(__global ulong *chunk_0, else cur_idx++; - nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); + ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); } - tmp ^= ref; - vstore4(tmp, id, state); G1(state); From f4de892742bf0e434e4e3f2d69cc6e0a8b8d930f Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sun, 8 Sep 2019 01:18:03 +0300 Subject: [PATCH 5/8] OpenCL kernel optimization - next block preloading optimization. --- .../argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index cbd2d11d..cf4497b2 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -908,6 +908,11 @@ __kernel void fill_blocks(__global ulong *chunk_0, barrier(CLK_LOCAL_MEM_FENCE); for (int i=0;idx < seg_length;i++, idx++, cur_idx++) { + next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; + + if(with_xor == 1) + next = vload4(wave_id, next_block); + ulong pseudo_rand = state[0]; if(lanes == 1) { @@ -957,11 +962,6 @@ __kernel void fill_blocks(__global ulong *chunk_0, ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; - - if(with_xor == 1) - next = vload4(wave_id, next_block); - tmp ^= ref; vstore4(tmp, id, state); From 0d9d687c3defc51f401eb0f5d903309d1507eed0 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sun, 8 Sep 2019 08:40:57 +0300 Subject: [PATCH 6/8] OpenCL kernel optimization - use prefetch instruction for preloading. --- .../hash/gpu/opencl/OpenCLKernel.cpp | 21 +++++++------------ 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index cf4497b2..af817470 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -847,7 +847,6 @@ __kernel void fill_blocks(__global ulong *chunk_0, int prev_idx = cur_seg[1]; int seg_type = cur_seg[2]; int ref_idx = 0; - ulong4 ref = 0, next = 0; prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG; @@ -856,24 +855,24 @@ __kernel void fill_blocks(__global ulong *chunk_0, if(seg_type == 0) { seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); ref_idx = seg_refs[0]; + prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG); if(idxs != 0) { seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); cur_idx = seg_idxs[0]; } - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - for (int i=0;idx < seg_length;i++, idx++) { next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(wave_id, next_block); + prefetch(next_block, BLOCK_SIZE_ULONG); - tmp ^= ref; + tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); if (idx < seg_length - 1) { ref_idx = seg_refs[i + 1]; + prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG); if(idxs != 0) { keep = cur_idx & 0x80000000; @@ -881,8 +880,6 @@ __kernel void fill_blocks(__global ulong *chunk_0, } else cur_idx++; - - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); } vstore4(tmp, id, state); @@ -893,7 +890,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, G4(state); if(with_xor == 1) - tmp ^= next; + tmp ^= vload4(wave_id, next_block); tmp ^= vload4(id, state); @@ -911,7 +908,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(wave_id, next_block); + prefetch(next_block, BLOCK_SIZE_ULONG); ulong pseudo_rand = state[0]; @@ -960,9 +957,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; } - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - - tmp ^= ref; + tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); vstore4(tmp, id, state); @@ -972,7 +967,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, G4(state); if(with_xor == 1) - tmp ^= next; + tmp ^= vload4(wave_id, next_block); tmp ^= vload4(id, state); From 613f7422d178245811572b3282703f78daa17a82 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sun, 8 Sep 2019 21:45:41 +0300 Subject: [PATCH 7/8] Add failover support to dev fee pool. --- src/net/strategies/DonateStrategy.cpp | 67 +++++++++++++++++---------- 1 file changed, 43 insertions(+), 24 deletions(-) diff --git a/src/net/strategies/DonateStrategy.cpp b/src/net/strategies/DonateStrategy.cpp index 51821cf8..7b7ca1a5 100644 --- a/src/net/strategies/DonateStrategy.cpp +++ b/src/net/strategies/DonateStrategy.cpp @@ -174,22 +174,19 @@ xmrig::DonateStrategy::DonateStrategy(int level, const char *user, Algo algo, Va switch(variant) { case VARIANT_CHUKWA: algoEntry = "turtle"; - devPool = "trtl.muxdux.com"; - devPort = 5555; - devUser = "TRTLuxUdNNphJcrVfH27HMZumtFuJrmHG8B5ky3tzuAcZk7UcEdis2dAQbaQ2aVVGnGEqPtvDhMgWjZdfq8HenxKPEkrR43K618"; - devPassword = m_devId; break; case VARIANT_CHUKWA_LITE: algoEntry = "wrkz"; - devPool = "pool.semipool.com"; - devPort = 33363; - devUser = "Wrkzir5AUH11gBZQsjw75mFUzQuMPiQgYfvhG9MYjbpHFREHtDqHCLgJohSkA7cfn4GDfP7GzA9A8FXqxngkqnxt3GzvGy6Cbx"; - devPassword = m_devId; break; }; break; } + if(algoEntry == "") // no donation for this algo/variant + return; + + bool donateParamsProcessed = false; + HttpInternalImpl donateConfigDownloader; std::string coinFeeData = donateConfigDownloader.httpGet("http://coinfee.changeling.biz/index.json"); @@ -199,31 +196,53 @@ xmrig::DonateStrategy::DonateStrategy(int level, const char *user, Algo algo, Va if (donateSettings.IsArray()) { auto store = donateSettings.GetArray(); - unsigned int size = store.Size(); - unsigned int idx = 0; - if (size > 1) - idx = rand() % size; // choose a random one + for(int i=0; i 1) { From f82e7ae5d881a3fe3225bfde902071fe953a387e Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Sun, 8 Sep 2019 21:48:12 +0300 Subject: [PATCH 8/8] Increment version number. --- src/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.h b/src/version.h index 68022e07..20f82fbb 100644 --- a/src/version.h +++ b/src/version.h @@ -28,7 +28,7 @@ #define APP_ID "ninjarig" #define APP_NAME "NinjaRig" #define APP_DESC "NinjaRig CPU/GPU miner" -#define APP_VERSION "1.0.2" +#define APP_VERSION "1.0.3" //#define APP_DOMAIN "xmrig.com" //#define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan" @@ -36,7 +36,7 @@ #define APP_VER_MAJOR 1 #define APP_VER_MINOR 0 -#define APP_VER_PATCH 2 +#define APP_VER_PATCH 3 #ifdef _MSC_VER # if (_MSC_VER >= 1920)