commit
1e881cb01a
7 changed files with 352 additions and 192 deletions
|
@ -8,6 +8,8 @@ option(WITH_TLS "Enable OpenSSL support" ON)
|
||||||
option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF)
|
option(WITH_EMBEDDED_CONFIG "Enable internal embedded JSON config" OFF)
|
||||||
option(WITH_CUDA "Enable CUDA support" ON)
|
option(WITH_CUDA "Enable CUDA support" ON)
|
||||||
option(WITH_OPENCL "Enable OpenCL support" ON)
|
option(WITH_OPENCL "Enable OpenCL support" ON)
|
||||||
|
option(WITH_OPENCL_DOUBLE_THREADS "Enable dual threads for OpenCL jobs" OFF)
|
||||||
|
option(WITH_CUDA_DOUBLE_THREADS "Enable dual threads for CUDA jobs" ON)
|
||||||
|
|
||||||
include (CheckIncludeFile)
|
include (CheckIncludeFile)
|
||||||
include (cmake/cpu.cmake)
|
include (cmake/cpu.cmake)
|
||||||
|
@ -373,6 +375,9 @@ endif(ARCH STREQUAL "arm" OR ARCH STREQUAL "aarch64")
|
||||||
if(WITH_OPENCL)
|
if(WITH_OPENCL)
|
||||||
add_definitions(-DWITH_OPENCL)
|
add_definitions(-DWITH_OPENCL)
|
||||||
find_package(OpenCL REQUIRED)
|
find_package(OpenCL REQUIRED)
|
||||||
|
if(WITH_OPENCL_DOUBLE_THREADS)
|
||||||
|
add_definitions(-DPARALLEL_OPENCL)
|
||||||
|
endif()
|
||||||
include_directories(${OpenCL_INCLUDE_DIR})
|
include_directories(${OpenCL_INCLUDE_DIR})
|
||||||
add_library(opencl_hasher MODULE ${SOURCE_OPENCL_HASHER})
|
add_library(opencl_hasher MODULE ${SOURCE_OPENCL_HASHER})
|
||||||
set_target_properties(opencl_hasher
|
set_target_properties(opencl_hasher
|
||||||
|
@ -388,7 +393,7 @@ endif()
|
||||||
if(WITH_CUDA)
|
if(WITH_CUDA)
|
||||||
add_definitions(-DWITH_CUDA)
|
add_definitions(-DWITH_CUDA)
|
||||||
find_package(CUDA REQUIRED)
|
find_package(CUDA REQUIRED)
|
||||||
if(NOT WIN32)
|
if(WITH_CUDA_DOUBLE_THREADS)
|
||||||
add_definitions(-DPARALLEL_CUDA)
|
add_definitions(-DPARALLEL_CUDA)
|
||||||
endif()
|
endif()
|
||||||
set(
|
set(
|
||||||
|
|
|
@ -589,18 +589,19 @@ __global__ void fill_blocks(uint32_t *scratchpad0,
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0);
|
uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0);
|
||||||
|
|
||||||
|
if(lanes > 1) {
|
||||||
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
|
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
|
||||||
|
|
||||||
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
|
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
|
||||||
uint32_t reference_area_size = 0;
|
uint32_t reference_area_size = 0;
|
||||||
if(pass > 0) {
|
if (pass > 0) {
|
||||||
if (lane == ref_lane) {
|
if (lane == ref_lane) {
|
||||||
reference_area_size = lane_length - seg_length + idx - 1;
|
reference_area_size = lane_length - seg_length + idx - 1;
|
||||||
} else {
|
} else {
|
||||||
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
else {
|
|
||||||
if (lane == ref_lane) {
|
if (lane == ref_lane) {
|
||||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||||
} else {
|
} else {
|
||||||
|
@ -611,7 +612,24 @@ __global__ void fill_blocks(uint32_t *scratchpad0,
|
||||||
|
|
||||||
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
||||||
|
|
||||||
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
ref_idx = ref_lane * lane_length +
|
||||||
|
(((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
|
||||||
|
lane_length;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
uint32_t reference_area_size = 0;
|
||||||
|
if (pass > 0) {
|
||||||
|
reference_area_size = lane_length - seg_length + idx - 1;
|
||||||
|
} else {
|
||||||
|
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||||
|
}
|
||||||
|
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
|
||||||
|
|
||||||
|
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
||||||
|
|
||||||
|
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
|
||||||
|
lane_length;
|
||||||
|
}
|
||||||
|
|
||||||
ref_block = memory + ref_idx * BLOCK_SIZE_UINT4;
|
ref_block = memory + ref_idx * BLOCK_SIZE_UINT4;
|
||||||
|
|
||||||
|
|
|
@ -196,6 +196,24 @@ OpenCLDeviceInfo *OpenCLHasher::getDeviceInfo(cl_platform_id platform, cl_device
|
||||||
|
|
||||||
device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/;
|
device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/;
|
||||||
|
|
||||||
|
string extensions;
|
||||||
|
sz = 0;
|
||||||
|
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &sz);
|
||||||
|
buffer = (char *)malloc(sz + 1);
|
||||||
|
device_info->error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sz, buffer, &sz);
|
||||||
|
if(device_info->error != CL_SUCCESS) {
|
||||||
|
free(buffer);
|
||||||
|
device_info->errorMessage = "Error querying device extensions.";
|
||||||
|
return device_info;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
buffer[sz] = 0;
|
||||||
|
extensions = buffer;
|
||||||
|
free(buffer);
|
||||||
|
}
|
||||||
|
|
||||||
|
device_info->deviceExtensions = extensions;
|
||||||
|
|
||||||
device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL);
|
device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL);
|
||||||
if(device_info->error != CL_SUCCESS) {
|
if(device_info->error != CL_SUCCESS) {
|
||||||
device_info->errorMessage = "Error querying device global memory size.";
|
device_info->errorMessage = "Error querying device global memory size.";
|
||||||
|
@ -345,7 +363,14 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
device->queue = clCreateCommandQueue(device->context, device->device, CL_QUEUE_PROFILING_ENABLE, &error);
|
device->queue[0] = clCreateCommandQueue(device->context, device->device, NULL, &error);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
device->error = error;
|
||||||
|
device->errorMessage = "Error getting device command queue.";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
device->queue[1] = clCreateCommandQueue(device->context, device->device, NULL, &error);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error getting device command queue.";
|
device->errorMessage = "Error getting device command queue.";
|
||||||
|
@ -362,7 +387,10 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
error = clBuildProgram(device->program, 1, &device->device, "", NULL, NULL);
|
string options = "";
|
||||||
|
if(device->deviceExtensions.find("cl_amd_media_ops") != string::npos)
|
||||||
|
options += "-D USE_AMD_BITALIGN";
|
||||||
|
error = clBuildProgram(device->program, 1, &device->device, options.c_str(), NULL, NULL);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
size_t log_size;
|
size_t log_size;
|
||||||
clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||||
|
@ -377,26 +405,44 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
device->kernelPrehash = clCreateKernel(device->program, "prehash", &error);
|
device->kernelPrehash[0] = clCreateKernel(device->program, "prehash", &error);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error creating opencl prehash kernel for device.";
|
device->errorMessage = "Error creating opencl prehash kernel for device.";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
device->kernelFillBlocks = clCreateKernel(device->program, "fill_blocks", &error);
|
device->kernelPrehash[1] = clCreateKernel(device->program, "prehash", &error);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
device->error = error;
|
||||||
|
device->errorMessage = "Error creating opencl prehash kernel for device.";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
device->kernelFillBlocks[0] = clCreateKernel(device->program, "fill_blocks", &error);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error creating opencl main kernel for device.";
|
device->errorMessage = "Error creating opencl main kernel for device.";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
device->kernelPosthash = clCreateKernel(device->program, "posthash", &error);
|
device->kernelFillBlocks[1] = clCreateKernel(device->program, "fill_blocks", &error);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
device->error = error;
|
||||||
|
device->errorMessage = "Error creating opencl main kernel for device.";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
device->kernelPosthash[0] = clCreateKernel(device->program, "posthash", &error);
|
||||||
|
if (error != CL_SUCCESS) {
|
||||||
|
device->error = error;
|
||||||
|
device->errorMessage = "Error creating opencl posthash kernel for device.";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
device->kernelPosthash[1] = clCreateKernel(device->program, "posthash", &error);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error creating opencl posthash kernel for device.";
|
device->errorMessage = "Error creating opencl posthash kernel for device.";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
device->profileInfo.threads_per_chunk = (uint32_t) (device->maxAllocableMemSize / device->profileInfo.profile->memSize);
|
device->profileInfo.threads_per_chunk = ((uint32_t) (device->maxAllocableMemSize / device->profileInfo.profile->memSize) / 2) * 2; // make it divisible by 2 to allow 2 hashes per wavefront
|
||||||
size_t chunk_size = device->profileInfo.threads_per_chunk * device->profileInfo.profile->memSize;
|
size_t chunk_size = device->profileInfo.threads_per_chunk * device->profileInfo.profile->memSize;
|
||||||
|
|
||||||
if (chunk_size == 0) {
|
if (chunk_size == 0) {
|
||||||
|
@ -629,7 +675,7 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
refs[i] = device->profileInfo.profile->blockRefs[i * 3 + 1];
|
refs[i] = device->profileInfo.profile->blockRefs[i * 3 + 1];
|
||||||
}
|
}
|
||||||
|
|
||||||
error=clEnqueueWriteBuffer(device->queue, device->arguments.refs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), refs, 0, NULL, NULL);
|
error=clEnqueueWriteBuffer(device->queue[0], device->arguments.refs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), refs, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error writing to gpu memory.";
|
device->errorMessage = "Error writing to gpu memory.";
|
||||||
|
@ -647,7 +693,7 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
error=clEnqueueWriteBuffer(device->queue, device->arguments.idxs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), idxs, 0, NULL, NULL);
|
error=clEnqueueWriteBuffer(device->queue[0], device->arguments.idxs, CL_TRUE, 0, device->profileInfo.profile->blockRefsSize * sizeof(uint32_t), idxs, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error writing to gpu memory.";
|
device->errorMessage = "Error writing to gpu memory.";
|
||||||
|
@ -657,37 +703,60 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
||||||
free(idxs);
|
free(idxs);
|
||||||
}
|
}
|
||||||
|
|
||||||
error=clEnqueueWriteBuffer(device->queue, device->arguments.segments, CL_TRUE, 0, device->profileInfo.profile->segCount * 3 * sizeof(uint32_t), device->profileInfo.profile->segments, 0, NULL, NULL);
|
error=clEnqueueWriteBuffer(device->queue[0], device->arguments.segments, CL_TRUE, 0, device->profileInfo.profile->segCount * 3 * sizeof(uint32_t), device->profileInfo.profile->segments, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error writing to gpu memory.";
|
device->errorMessage = "Error writing to gpu memory.";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 8, sizeof(device->arguments.refs), &device->arguments.refs);
|
|
||||||
if(device->profileInfo.profile->succesiveIdxs == 0)
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 9, sizeof(device->arguments.idxs), &device->arguments.idxs);
|
|
||||||
else
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 9, sizeof(cl_mem), NULL);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 10, sizeof(device->arguments.segments), &device->arguments.segments);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 11, sizeof(int32_t), &device->profileInfo.profile->memSize);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 12, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 13, sizeof(int32_t), &device->profileInfo.profile->segSize);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 14, sizeof(int32_t), &device->profileInfo.profile->segCount);
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk);
|
|
||||||
|
|
||||||
clSetKernelArg(device->kernelPrehash, 2, sizeof(int32_t), &device->profileInfo.profile->memCost);
|
|
||||||
clSetKernelArg(device->kernelPrehash, 3, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
|
||||||
int passes = device->profileInfo.profile->segCount / (4 * device->profileInfo.profile->thrCost);
|
int passes = device->profileInfo.profile->segCount / (4 * device->profileInfo.profile->thrCost);
|
||||||
clSetKernelArg(device->kernelPrehash, 4, sizeof(int32_t), &passes);
|
|
||||||
clSetKernelArg(device->kernelPrehash, 6, sizeof(int32_t), &device->profileInfo.profile->saltLen);
|
|
||||||
|
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 8, sizeof(device->arguments.refs), &device->arguments.refs);
|
||||||
|
if(device->profileInfo.profile->succesiveIdxs == 0)
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 9, sizeof(device->arguments.idxs), &device->arguments.idxs);
|
||||||
|
else
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 9, sizeof(cl_mem), NULL);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 10, sizeof(device->arguments.segments), &device->arguments.segments);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 11, sizeof(int32_t), &device->profileInfo.profile->memSize);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 12, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 13, sizeof(int32_t), &device->profileInfo.profile->segSize);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 14, sizeof(int32_t), &device->profileInfo.profile->segCount);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[0], 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk);
|
||||||
|
|
||||||
|
clSetKernelArg(device->kernelPrehash[0], 2, sizeof(int32_t), &device->profileInfo.profile->memCost);
|
||||||
|
clSetKernelArg(device->kernelPrehash[0], 3, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
||||||
|
clSetKernelArg(device->kernelPrehash[0], 4, sizeof(int32_t), &passes);
|
||||||
|
clSetKernelArg(device->kernelPrehash[0], 6, sizeof(int32_t), &device->profileInfo.profile->saltLen);
|
||||||
|
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 0, sizeof(device->arguments.memoryChunk_0), &device->arguments.memoryChunk_0);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 1, sizeof(device->arguments.memoryChunk_1), &device->arguments.memoryChunk_1);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 2, sizeof(device->arguments.memoryChunk_2), &device->arguments.memoryChunk_2);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 3, sizeof(device->arguments.memoryChunk_3), &device->arguments.memoryChunk_3);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 4, sizeof(device->arguments.memoryChunk_4), &device->arguments.memoryChunk_4);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 5, sizeof(device->arguments.memoryChunk_5), &device->arguments.memoryChunk_5);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 8, sizeof(device->arguments.refs), &device->arguments.refs);
|
||||||
|
if(device->profileInfo.profile->succesiveIdxs == 0)
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 9, sizeof(device->arguments.idxs), &device->arguments.idxs);
|
||||||
|
else
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 9, sizeof(cl_mem), NULL);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 10, sizeof(device->arguments.segments), &device->arguments.segments);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 11, sizeof(int32_t), &device->profileInfo.profile->memSize);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 12, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 13, sizeof(int32_t), &device->profileInfo.profile->segSize);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 14, sizeof(int32_t), &device->profileInfo.profile->segCount);
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[1], 15, sizeof(int32_t), &device->profileInfo.threads_per_chunk);
|
||||||
|
|
||||||
|
clSetKernelArg(device->kernelPrehash[1], 2, sizeof(int32_t), &device->profileInfo.profile->memCost);
|
||||||
|
clSetKernelArg(device->kernelPrehash[1], 3, sizeof(int32_t), &device->profileInfo.profile->thrCost);
|
||||||
|
clSetKernelArg(device->kernelPrehash[1], 4, sizeof(int32_t), &passes);
|
||||||
|
clSetKernelArg(device->kernelPrehash[1], 6, sizeof(int32_t), &device->profileInfo.profile->saltLen);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -703,29 +772,29 @@ bool opencl_kernel_prehasher(void *memory, int threads, Argon2Profile *profile,
|
||||||
size_t total_work_items = sessions * 4 * ceil(threads / hashes_per_block);
|
size_t total_work_items = sessions * 4 * ceil(threads / hashes_per_block);
|
||||||
size_t local_work_items = sessions * 4;
|
size_t local_work_items = sessions * 4;
|
||||||
|
|
||||||
device->deviceLock.lock();
|
gpumgmt_thread->lock();
|
||||||
|
|
||||||
error = clEnqueueWriteBuffer(device->queue, device->arguments.preseedMemory[gpumgmt_thread->threadId],
|
error = clEnqueueWriteBuffer(device->queue[gpumgmt_thread->threadId], device->arguments.preseedMemory[gpumgmt_thread->threadId],
|
||||||
CL_FALSE, 0, gpumgmt_thread->hashData.inSize, memory, 0, NULL, NULL);
|
CL_FALSE, 0, gpumgmt_thread->hashData.inSize, memory, 0, NULL, NULL);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error writing to gpu memory.";
|
device->errorMessage = "Error writing to gpu memory.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
int inSizeInInt = gpumgmt_thread->hashData.inSize / 4;
|
int inSizeInInt = gpumgmt_thread->hashData.inSize / 4;
|
||||||
clSetKernelArg(device->kernelPrehash, 0, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 0, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelPrehash, 1, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 1, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelPrehash, 5, sizeof(int), &inSizeInInt);
|
clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 5, sizeof(int), &inSizeInInt);
|
||||||
clSetKernelArg(device->kernelPrehash, 7, sizeof(int), &threads);
|
clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 7, sizeof(int), &threads);
|
||||||
clSetKernelArg(device->kernelPrehash, 8, sessions * sizeof(cl_ulong) * 76, NULL); // (preseed size is 16 ulongs = 128 bytes)
|
clSetKernelArg(device->kernelPrehash[gpumgmt_thread->threadId], 8, sessions * sizeof(cl_ulong) * 76, NULL); // (preseed size is 16 ulongs = 128 bytes)
|
||||||
|
|
||||||
error=clEnqueueNDRangeKernel(device->queue, device->kernelPrehash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelPrehash[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error running the kernel.";
|
device->errorMessage = "Error running the kernel.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -739,19 +808,20 @@ void *opencl_kernel_filler(int threads, Argon2Profile *profile, void *user_data)
|
||||||
cl_int error;
|
cl_int error;
|
||||||
|
|
||||||
size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost;
|
size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost;
|
||||||
size_t local_work_items = KERNEL_WORKGROUP_SIZE * profile->thrCost;
|
size_t local_work_items = 2 * KERNEL_WORKGROUP_SIZE * profile->thrCost;
|
||||||
|
|
||||||
size_t shared_mem = profile->thrCost * ARGON2_QWORDS_IN_BLOCK;
|
size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK;
|
||||||
|
|
||||||
clSetKernelArg(device->kernelFillBlocks, 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelFillBlocks, 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelFillBlocks, 16, sizeof(cl_ulong) * shared_mem, NULL);
|
clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 16, sizeof(int), &(gpumgmt_thread->threadsIdx));
|
||||||
|
clSetKernelArg(device->kernelFillBlocks[gpumgmt_thread->threadId], 17, sizeof(cl_ulong) * shared_mem, NULL);
|
||||||
|
|
||||||
error=clEnqueueNDRangeKernel(device->queue, device->kernelFillBlocks, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelFillBlocks[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error running the kernel.";
|
device->errorMessage = "Error running the kernel.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -767,36 +837,36 @@ bool opencl_kernel_posthasher(void *memory, int threads, Argon2Profile *profile,
|
||||||
size_t total_work_items = threads * 4;
|
size_t total_work_items = threads * 4;
|
||||||
size_t local_work_items = 4;
|
size_t local_work_items = 4;
|
||||||
|
|
||||||
clSetKernelArg(device->kernelPosthash, 0, sizeof(device->arguments.hashMemory[gpumgmt_thread->threadId]), &device->arguments.hashMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 0, sizeof(device->arguments.hashMemory[gpumgmt_thread->threadId]), &device->arguments.hashMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelPosthash, 1, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 1, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelPosthash, 2, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 2, sizeof(device->arguments.preseedMemory[gpumgmt_thread->threadId]), &device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
||||||
clSetKernelArg(device->kernelPosthash, 3, sizeof(cl_ulong) * 60, NULL);
|
clSetKernelArg(device->kernelPosthash[gpumgmt_thread->threadId], 3, sizeof(cl_ulong) * 60, NULL);
|
||||||
|
|
||||||
error=clEnqueueNDRangeKernel(device->queue, device->kernelPosthash, 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
error=clEnqueueNDRangeKernel(device->queue[gpumgmt_thread->threadId], device->kernelPosthash[gpumgmt_thread->threadId], 1, NULL, &total_work_items, &local_work_items, 0, NULL, NULL);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error running the kernel.";
|
device->errorMessage = "Error running the kernel.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
error = clEnqueueReadBuffer(device->queue, device->arguments.hashMemory[gpumgmt_thread->threadId], CL_FALSE, 0, threads * (xmrig::ARGON2_HASHLEN + 4), memory, 0, NULL, NULL);
|
error = clEnqueueReadBuffer(device->queue[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], CL_FALSE, 0, threads * (xmrig::ARGON2_HASHLEN + 4), memory, 0, NULL, NULL);
|
||||||
if (error != CL_SUCCESS) {
|
if (error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error reading gpu memory.";
|
device->errorMessage = "Error reading gpu memory.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
error=clFinish(device->queue);
|
error=clFinish(device->queue[gpumgmt_thread->threadId]);
|
||||||
if(error != CL_SUCCESS) {
|
if(error != CL_SUCCESS) {
|
||||||
device->error = error;
|
device->error = error;
|
||||||
device->errorMessage = "Error flushing GPU queue.";
|
device->errorMessage = "Error flushing GPU queue.";
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
device->deviceLock.unlock();
|
gpumgmt_thread->unlock();
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
@ -810,9 +880,24 @@ void OpenCLHasher::buildThreadData() {
|
||||||
OpenCLGpuMgmtThreadData &thread_data = m_threadData[i * 2 + threadId];
|
OpenCLGpuMgmtThreadData &thread_data = m_threadData[i * 2 + threadId];
|
||||||
thread_data.device = device;
|
thread_data.device = device;
|
||||||
thread_data.threadId = threadId;
|
thread_data.threadId = threadId;
|
||||||
|
|
||||||
|
#ifdef PARALLEL_OPENCL
|
||||||
|
if(threadId == 0) {
|
||||||
|
thread_data.threadsIdx = 0;
|
||||||
|
thread_data.threads = device->profileInfo.threads / 2;
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
thread_data.threadsIdx = device->profileInfo.threads / 2;
|
||||||
|
thread_data.threads = device->profileInfo.threads - thread_data.threadsIdx;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
thread_data.threadsIdx = 0;
|
||||||
|
thread_data.threads = device->profileInfo.threads;
|
||||||
|
#endif
|
||||||
|
|
||||||
thread_data.argon2 = new Argon2(opencl_kernel_prehasher, opencl_kernel_filler, opencl_kernel_posthasher,
|
thread_data.argon2 = new Argon2(opencl_kernel_prehasher, opencl_kernel_filler, opencl_kernel_posthasher,
|
||||||
nullptr, &thread_data);
|
nullptr, &thread_data);
|
||||||
thread_data.argon2->setThreads(device->profileInfo.threads);
|
thread_data.argon2->setThreads(thread_data.threads);
|
||||||
thread_data.hashData.outSize = xmrig::ARGON2_HASHLEN + 4;
|
thread_data.hashData.outSize = xmrig::ARGON2_HASHLEN + 4;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -830,7 +915,7 @@ int OpenCLHasher::compute(int threadIdx, uint8_t *input, size_t size, uint8_t *o
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t *nonce = ((uint32_t *)(((uint8_t*)threadData.hashData.input) + 39));
|
uint32_t *nonce = ((uint32_t *)(((uint8_t*)threadData.hashData.input) + 39));
|
||||||
(*nonce) += threadData.device->profileInfo.threads;
|
(*nonce) += threadData.threads;
|
||||||
|
|
||||||
return hashCount;
|
return hashCount;
|
||||||
}
|
}
|
||||||
|
@ -857,11 +942,15 @@ void OpenCLHasher::cleanup() {
|
||||||
clReleaseMemObject((*it)->arguments.hashMemory[0]);
|
clReleaseMemObject((*it)->arguments.hashMemory[0]);
|
||||||
clReleaseMemObject((*it)->arguments.hashMemory[1]);
|
clReleaseMemObject((*it)->arguments.hashMemory[1]);
|
||||||
|
|
||||||
clReleaseKernel((*it)->kernelPrehash);
|
clReleaseKernel((*it)->kernelPrehash[0]);
|
||||||
clReleaseKernel((*it)->kernelFillBlocks);
|
clReleaseKernel((*it)->kernelPrehash[1]);
|
||||||
clReleaseKernel((*it)->kernelPosthash);
|
clReleaseKernel((*it)->kernelFillBlocks[0]);
|
||||||
|
clReleaseKernel((*it)->kernelFillBlocks[1]);
|
||||||
|
clReleaseKernel((*it)->kernelPosthash[0]);
|
||||||
|
clReleaseKernel((*it)->kernelPosthash[1]);
|
||||||
clReleaseProgram((*it)->program);
|
clReleaseProgram((*it)->program);
|
||||||
clReleaseCommandQueue((*it)->queue);
|
clReleaseCommandQueue((*it)->queue[0]);
|
||||||
|
clReleaseCommandQueue((*it)->queue[1]);
|
||||||
clReleaseContext((*it)->context);
|
clReleaseContext((*it)->context);
|
||||||
}
|
}
|
||||||
clReleaseDevice((*it)->device);
|
clReleaseDevice((*it)->device);
|
||||||
|
@ -871,13 +960,8 @@ void OpenCLHasher::cleanup() {
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t OpenCLHasher::parallelism(int workerIdx) {
|
size_t OpenCLHasher::parallelism(int workerIdx) {
|
||||||
// there are 2 computing threads per device, so divide by 2 to get device index
|
OpenCLGpuMgmtThreadData &threadData = m_threadData[workerIdx];
|
||||||
workerIdx /= 2;
|
return threadData.threads;
|
||||||
|
|
||||||
if(workerIdx < 0 || workerIdx > m_enabledDevices.size())
|
|
||||||
return 0;
|
|
||||||
|
|
||||||
return m_enabledDevices[workerIdx]->profileInfo.threads;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t OpenCLHasher::deviceCount() {
|
size_t OpenCLHasher::deviceCount() {
|
||||||
|
|
|
@ -51,12 +51,12 @@ struct OpenCLDeviceInfo {
|
||||||
cl_platform_id platform;
|
cl_platform_id platform;
|
||||||
cl_device_id device;
|
cl_device_id device;
|
||||||
cl_context context;
|
cl_context context;
|
||||||
cl_command_queue queue;
|
cl_command_queue queue[2];
|
||||||
|
|
||||||
cl_program program;
|
cl_program program;
|
||||||
cl_kernel kernelPrehash;
|
cl_kernel kernelPrehash[2];
|
||||||
cl_kernel kernelFillBlocks;
|
cl_kernel kernelFillBlocks[2];
|
||||||
cl_kernel kernelPosthash;
|
cl_kernel kernelPosthash[2];
|
||||||
|
|
||||||
int deviceIndex;
|
int deviceIndex;
|
||||||
|
|
||||||
|
@ -64,6 +64,7 @@ struct OpenCLDeviceInfo {
|
||||||
Argon2ProfileInfo profileInfo;
|
Argon2ProfileInfo profileInfo;
|
||||||
|
|
||||||
string deviceString;
|
string deviceString;
|
||||||
|
string deviceExtensions;
|
||||||
uint64_t maxMemSize;
|
uint64_t maxMemSize;
|
||||||
uint64_t maxAllocableMemSize;
|
uint64_t maxAllocableMemSize;
|
||||||
|
|
||||||
|
@ -74,10 +75,23 @@ struct OpenCLDeviceInfo {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct OpenCLGpuMgmtThreadData {
|
struct OpenCLGpuMgmtThreadData {
|
||||||
|
void lock() {
|
||||||
|
#ifndef PARALLEL_OPENCL
|
||||||
|
device->deviceLock.lock();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
void unlock() {
|
||||||
|
#ifndef PARALLEL_OPENCL
|
||||||
|
device->deviceLock.unlock();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
int threadId;
|
int threadId;
|
||||||
OpenCLDeviceInfo *device;
|
OpenCLDeviceInfo *device;
|
||||||
Argon2 *argon2;
|
Argon2 *argon2;
|
||||||
HashData hashData;
|
HashData hashData;
|
||||||
|
int threads;
|
||||||
|
int threadsIdx;
|
||||||
};
|
};
|
||||||
|
|
||||||
class OpenCLHasher : public Hasher {
|
class OpenCLHasher : public Hasher {
|
||||||
|
|
|
@ -26,6 +26,14 @@ string OpenCLKernel = R"OCL(
|
||||||
#define BLOCK_BYTES 32
|
#define BLOCK_BYTES 32
|
||||||
#define OUT_BYTES 16
|
#define OUT_BYTES 16
|
||||||
|
|
||||||
|
#ifdef USE_AMD_BITALIGN
|
||||||
|
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
|
||||||
|
|
||||||
|
#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n)))
|
||||||
|
#else
|
||||||
|
#define rotr64(x, n) rotate((x), 64UL - (n))
|
||||||
|
#endif
|
||||||
|
|
||||||
#define G(m, r, i, a, b, c, d) \
|
#define G(m, r, i, a, b, c, d) \
|
||||||
{ \
|
{ \
|
||||||
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
||||||
|
@ -90,11 +98,6 @@ string OpenCLKernel = R"OCL(
|
||||||
v3 = shfl[t + 12]; \
|
v3 = shfl[t + 12]; \
|
||||||
}
|
}
|
||||||
|
|
||||||
ulong rotr64(ulong x, ulong n)
|
|
||||||
{
|
|
||||||
return rotate(x, 64 - n);
|
|
||||||
}
|
|
||||||
|
|
||||||
__constant ulong blake2b_IV[8] = {
|
__constant ulong blake2b_IV[8] = {
|
||||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||||
|
@ -547,13 +550,13 @@ void blake2b_digestLong_local(__global uint *out, int out_len,
|
||||||
|
|
||||||
#define COMPUTE \
|
#define COMPUTE \
|
||||||
a = fBlaMka(a, b); \
|
a = fBlaMka(a, b); \
|
||||||
d = rotate(d ^ a, (ulong)32); \
|
d = rotr64(d ^ a, (ulong)32); \
|
||||||
c = fBlaMka(c, d); \
|
c = fBlaMka(c, d); \
|
||||||
b = rotate(b ^ c, (ulong)40); \
|
b = rotr64(b ^ c, (ulong)24); \
|
||||||
a = fBlaMka(a, b); \
|
a = fBlaMka(a, b); \
|
||||||
d = rotate(d ^ a, (ulong)48); \
|
d = rotr64(d ^ a, (ulong)16); \
|
||||||
c = fBlaMka(c, d); \
|
c = fBlaMka(c, d); \
|
||||||
b = rotate(b ^ c, (ulong)1);
|
b = rotr64(b ^ c, (ulong)63);
|
||||||
|
|
||||||
__constant char offsets_round_1[32][4] = {
|
__constant char offsets_round_1[32][4] = {
|
||||||
{ 0, 4, 8, 12 },
|
{ 0, 4, 8, 12 },
|
||||||
|
@ -764,17 +767,23 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
int seg_length,
|
int seg_length,
|
||||||
int seg_count,
|
int seg_count,
|
||||||
int threads_per_chunk,
|
int threads_per_chunk,
|
||||||
|
int thread_idx,
|
||||||
__local ulong *scratchpad) { // lanes * BLOCK_SIZE_ULONG
|
__local ulong *scratchpad) { // lanes * BLOCK_SIZE_ULONG
|
||||||
ulong4 tmp;
|
ulong4 tmp;
|
||||||
ulong a, b, c, d;
|
ulong a, b, c, d;
|
||||||
|
|
||||||
int hash = get_group_id(0);
|
int hash_base = get_group_id(0) * 2;
|
||||||
|
int mem_hash = hash_base + thread_idx;
|
||||||
int local_id = get_local_id(0);
|
int local_id = get_local_id(0);
|
||||||
|
|
||||||
int id = local_id % THREADS_PER_LANE;
|
int hash_idx = (local_id / THREADS_PER_LANE) % 2;
|
||||||
int lane = local_id / THREADS_PER_LANE;
|
int wave_id = local_id % (THREADS_PER_LANE * 2);
|
||||||
|
int id = wave_id % THREADS_PER_LANE;
|
||||||
|
int lane = local_id / (THREADS_PER_LANE * 2);
|
||||||
int lane_length = seg_length * 4;
|
int lane_length = seg_length * 4;
|
||||||
|
|
||||||
|
int hash = hash_base + hash_idx;
|
||||||
|
|
||||||
ulong chunks[6];
|
ulong chunks[6];
|
||||||
chunks[0] = (ulong)chunk_0;
|
chunks[0] = (ulong)chunk_0;
|
||||||
chunks[1] = (ulong)chunk_1;
|
chunks[1] = (ulong)chunk_1;
|
||||||
|
@ -782,8 +791,8 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
chunks[3] = (ulong)chunk_3;
|
chunks[3] = (ulong)chunk_3;
|
||||||
chunks[4] = (ulong)chunk_4;
|
chunks[4] = (ulong)chunk_4;
|
||||||
chunks[5] = (ulong)chunk_5;
|
chunks[5] = (ulong)chunk_5;
|
||||||
int chunk_index = hash / threads_per_chunk;
|
int chunk_index = mem_hash / threads_per_chunk;
|
||||||
int chunk_offset = hash - chunk_index * threads_per_chunk;
|
int chunk_offset = mem_hash - chunk_index * threads_per_chunk;
|
||||||
__global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);
|
__global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);
|
||||||
|
|
||||||
int i1_0 = offsets_round_1[id][0];
|
int i1_0 = offsets_round_1[id][0];
|
||||||
|
@ -806,15 +815,13 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
int i4_2 = offsets_round_4[id][2];
|
int i4_2 = offsets_round_4[id][2];
|
||||||
int i4_3 = offsets_round_4[id][3];
|
int i4_3 = offsets_round_4[id][3];
|
||||||
|
|
||||||
__global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
|
|
||||||
__global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
|
__global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
|
||||||
|
__global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;
|
||||||
__global ulong *seed_dst = memory + lane * lane_length * BLOCK_SIZE_ULONG;
|
|
||||||
|
|
||||||
vstore4(vload4(id, seed_mem), id, seed_dst);
|
vstore4(vload4(id, seed_mem), id, seed_dst);
|
||||||
|
|
||||||
seed_mem += BLOCK_SIZE_ULONG;
|
seed_mem += BLOCK_SIZE_ULONG;
|
||||||
seed_dst += BLOCK_SIZE_ULONG;
|
seed_dst += (2 * BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
vstore4(vload4(id, seed_mem), id, seed_dst);
|
vstore4(vload4(id, seed_mem), id, seed_dst);
|
||||||
|
|
||||||
|
@ -823,7 +830,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
__global uint *seg_refs;
|
__global uint *seg_refs;
|
||||||
__global uint *seg_idxs;
|
__global uint *seg_idxs;
|
||||||
|
|
||||||
__local ulong *state = scratchpad + lane * BLOCK_SIZE_ULONG;
|
__local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG;
|
||||||
|
|
||||||
segments += (lane * 3);
|
segments += (lane * 3);
|
||||||
|
|
||||||
|
@ -840,33 +847,32 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
int prev_idx = cur_seg[1];
|
int prev_idx = cur_seg[1];
|
||||||
int seg_type = cur_seg[2];
|
int seg_type = cur_seg[2];
|
||||||
int ref_idx = 0;
|
int ref_idx = 0;
|
||||||
ulong4 ref = 0, next = 0;
|
|
||||||
|
|
||||||
prev_block = memory + prev_idx * BLOCK_SIZE_ULONG;
|
prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;
|
||||||
|
|
||||||
tmp = vload4(id, prev_block);
|
tmp = vload4(wave_id, prev_block);
|
||||||
|
|
||||||
if(seg_type == 0) {
|
if(seg_type == 0) {
|
||||||
seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
|
seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
|
||||||
ref_idx = seg_refs[0];
|
ref_idx = seg_refs[0];
|
||||||
|
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
if(idxs != 0) {
|
if(idxs != 0) {
|
||||||
seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
|
seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
|
||||||
cur_idx = seg_idxs[0];
|
cur_idx = seg_idxs[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
ulong4 nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
|
|
||||||
|
|
||||||
for (int i=0;idx < seg_length;i++, idx++) {
|
for (int i=0;idx < seg_length;i++, idx++) {
|
||||||
next_block = memory + (cur_idx & 0x7FFFFFFF) * BLOCK_SIZE_ULONG;
|
next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;
|
||||||
|
|
||||||
if(with_xor == 1)
|
if(with_xor == 1)
|
||||||
next = vload4(id, next_block);
|
prefetch(next_block, BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
ref = nextref;
|
tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
if (idx < seg_length - 1) {
|
if (idx < seg_length - 1) {
|
||||||
ref_idx = seg_refs[i + 1];
|
ref_idx = seg_refs[i + 1];
|
||||||
|
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
if(idxs != 0) {
|
if(idxs != 0) {
|
||||||
keep = cur_idx & 0x80000000;
|
keep = cur_idx & 0x80000000;
|
||||||
|
@ -874,12 +880,8 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
cur_idx++;
|
cur_idx++;
|
||||||
|
|
||||||
nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
tmp ^= ref;
|
|
||||||
|
|
||||||
vstore4(tmp, id, state);
|
vstore4(tmp, id, state);
|
||||||
|
|
||||||
G1(state);
|
G1(state);
|
||||||
|
@ -888,12 +890,12 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
G4(state);
|
G4(state);
|
||||||
|
|
||||||
if(with_xor == 1)
|
if(with_xor == 1)
|
||||||
tmp ^= next;
|
tmp ^= vload4(wave_id, next_block);
|
||||||
|
|
||||||
tmp ^= vload4(id, state);
|
tmp ^= vload4(id, state);
|
||||||
|
|
||||||
if(keep > 0) {
|
if(keep > 0) {
|
||||||
vstore4(tmp, id, next_block);
|
vstore4(tmp, wave_id, next_block);
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE);
|
barrier(CLK_GLOBAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -903,8 +905,31 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
|
for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
|
||||||
|
next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;
|
||||||
|
|
||||||
|
if(with_xor == 1)
|
||||||
|
prefetch(next_block, BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
ulong pseudo_rand = state[0];
|
ulong pseudo_rand = state[0];
|
||||||
|
|
||||||
|
if(lanes == 1) {
|
||||||
|
uint reference_area_size = 0;
|
||||||
|
|
||||||
|
if(pass > 0) {
|
||||||
|
reference_area_size = lane_length - seg_length + idx - 1;
|
||||||
|
} else {
|
||||||
|
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||||
|
}
|
||||||
|
|
||||||
|
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
|
||||||
|
relative_position = (relative_position * relative_position) >> 32;
|
||||||
|
|
||||||
|
relative_position = reference_area_size - 1 -
|
||||||
|
((reference_area_size * relative_position) >> 32);
|
||||||
|
|
||||||
|
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||||
|
}
|
||||||
|
else {
|
||||||
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
|
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
|
||||||
uint reference_area_size = 0;
|
uint reference_area_size = 0;
|
||||||
|
|
||||||
|
@ -930,15 +955,9 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
((reference_area_size * relative_position) >> 32);
|
((reference_area_size * relative_position) >> 32);
|
||||||
|
|
||||||
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||||
|
}
|
||||||
|
|
||||||
ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
|
tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
next_block = memory + cur_idx * BLOCK_SIZE_ULONG;
|
|
||||||
|
|
||||||
if(with_xor == 1)
|
|
||||||
next = vload4(id, next_block);
|
|
||||||
|
|
||||||
tmp ^= ref;
|
|
||||||
|
|
||||||
vstore4(tmp, id, state);
|
vstore4(tmp, id, state);
|
||||||
|
|
||||||
|
@ -948,12 +967,12 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
G4(state);
|
G4(state);
|
||||||
|
|
||||||
if(with_xor == 1)
|
if(with_xor == 1)
|
||||||
tmp ^= next;
|
tmp ^= vload4(wave_id, next_block);
|
||||||
|
|
||||||
tmp ^= vload4(id, state);
|
tmp ^= vload4(id, state);
|
||||||
|
|
||||||
vstore4(tmp, id, state);
|
vstore4(tmp, id, state);
|
||||||
vstore4(tmp, id, next_block);
|
vstore4(tmp, wave_id, next_block);
|
||||||
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -963,8 +982,9 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
||||||
barrier(CLK_LOCAL_MEM_FENCE);
|
barrier(CLK_LOCAL_MEM_FENCE);
|
||||||
|
|
||||||
if(lane == 0) { // first lane needs to acumulate results
|
if(lane == 0) { // first lane needs to acumulate results
|
||||||
|
__global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
|
||||||
for(int l=1; l<lanes; l++)
|
for(int l=1; l<lanes; l++)
|
||||||
tmp ^= vload4(id, scratchpad + l * BLOCK_SIZE_ULONG);
|
tmp ^= vload4(id, scratchpad + (l * 2 + hash_idx) * BLOCK_SIZE_ULONG);
|
||||||
|
|
||||||
vstore4(tmp, id, out_mem);
|
vstore4(tmp, id, out_mem);
|
||||||
}
|
}
|
||||||
|
|
|
@ -174,22 +174,19 @@ xmrig::DonateStrategy::DonateStrategy(int level, const char *user, Algo algo, Va
|
||||||
switch(variant) {
|
switch(variant) {
|
||||||
case VARIANT_CHUKWA:
|
case VARIANT_CHUKWA:
|
||||||
algoEntry = "turtle";
|
algoEntry = "turtle";
|
||||||
devPool = "trtl.muxdux.com";
|
|
||||||
devPort = 5555;
|
|
||||||
devUser = "TRTLuxUdNNphJcrVfH27HMZumtFuJrmHG8B5ky3tzuAcZk7UcEdis2dAQbaQ2aVVGnGEqPtvDhMgWjZdfq8HenxKPEkrR43K618";
|
|
||||||
devPassword = m_devId;
|
|
||||||
break;
|
break;
|
||||||
case VARIANT_CHUKWA_LITE:
|
case VARIANT_CHUKWA_LITE:
|
||||||
algoEntry = "wrkz";
|
algoEntry = "wrkz";
|
||||||
devPool = "pool.semipool.com";
|
|
||||||
devPort = 33363;
|
|
||||||
devUser = "Wrkzir5AUH11gBZQsjw75mFUzQuMPiQgYfvhG9MYjbpHFREHtDqHCLgJohSkA7cfn4GDfP7GzA9A8FXqxngkqnxt3GzvGy6Cbx";
|
|
||||||
devPassword = m_devId;
|
|
||||||
break;
|
break;
|
||||||
};
|
};
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if(algoEntry == "") // no donation for this algo/variant
|
||||||
|
return;
|
||||||
|
|
||||||
|
bool donateParamsProcessed = false;
|
||||||
|
|
||||||
HttpInternalImpl donateConfigDownloader;
|
HttpInternalImpl donateConfigDownloader;
|
||||||
std::string coinFeeData = donateConfigDownloader.httpGet("http://coinfee.changeling.biz/index.json");
|
std::string coinFeeData = donateConfigDownloader.httpGet("http://coinfee.changeling.biz/index.json");
|
||||||
|
|
||||||
|
@ -199,12 +196,8 @@ xmrig::DonateStrategy::DonateStrategy(int level, const char *user, Algo algo, Va
|
||||||
|
|
||||||
if (donateSettings.IsArray()) {
|
if (donateSettings.IsArray()) {
|
||||||
auto store = donateSettings.GetArray();
|
auto store = donateSettings.GetArray();
|
||||||
unsigned int size = store.Size();
|
for(int i=0; i<store.Size(); i++) {
|
||||||
unsigned int idx = 0;
|
const rapidjson::Value &value = store[i];
|
||||||
if (size > 1)
|
|
||||||
idx = rand() % size; // choose a random one
|
|
||||||
|
|
||||||
const rapidjson::Value &value = store[idx];
|
|
||||||
|
|
||||||
if (value.IsObject() &&
|
if (value.IsObject() &&
|
||||||
(value.HasMember("pool") && value["pool"].IsString()) &&
|
(value.HasMember("pool") && value["pool"].IsString()) &&
|
||||||
|
@ -216,14 +209,40 @@ xmrig::DonateStrategy::DonateStrategy(int level, const char *user, Algo algo, Va
|
||||||
devPort = value["port"].GetUint();
|
devPort = value["port"].GetUint();
|
||||||
devUser = replStr(value["user"].GetString(), "{ID}", m_devId.data());
|
devUser = replStr(value["user"].GetString(), "{ID}", m_devId.data());
|
||||||
devPassword = replStr(value["password"].GetString(), "{ID}", m_devId.data());
|
devPassword = replStr(value["password"].GetString(), "{ID}", m_devId.data());
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
m_pools.push_back(Pool(devPool.data(), devPort, devUser, devPassword, false, false));
|
m_pools.push_back(Pool(devPool.data(), devPort, devUser, devPassword, false, false));
|
||||||
|
|
||||||
|
donateParamsProcessed = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if(!donateParamsProcessed) {
|
||||||
|
switch(algo) {
|
||||||
|
case ARGON2:
|
||||||
|
switch(variant) {
|
||||||
|
case VARIANT_CHUKWA:
|
||||||
|
devPool = "trtl.muxdux.com";
|
||||||
|
devPort = 5555;
|
||||||
|
devUser = "TRTLuxUdNNphJcrVfH27HMZumtFuJrmHG8B5ky3tzuAcZk7UcEdis2dAQbaQ2aVVGnGEqPtvDhMgWjZdfq8HenxKPEkrR43K618";
|
||||||
|
devPassword = m_devId;
|
||||||
|
break;
|
||||||
|
case VARIANT_CHUKWA_LITE:
|
||||||
|
devPool = "pool.semipool.com";
|
||||||
|
devPort = 33363;
|
||||||
|
devUser = "Wrkzir5AUH11gBZQsjw75mFUzQuMPiQgYfvhG9MYjbpHFREHtDqHCLgJohSkA7cfn4GDfP7GzA9A8FXqxngkqnxt3GzvGy6Cbx";
|
||||||
|
devPassword = m_devId;
|
||||||
|
break;
|
||||||
|
};
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
m_pools.push_back(Pool(devPool.data(), devPort, devUser, devPassword, false, false));
|
||||||
|
}
|
||||||
|
|
||||||
for (Pool &pool : m_pools) {
|
for (Pool &pool : m_pools) {
|
||||||
pool.adjust(Algorithm(algo, VARIANT_AUTO));
|
pool.adjust(Algorithm(algo, variant));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (m_pools.size() > 1) {
|
if (m_pools.size() > 1) {
|
||||||
|
|
|
@ -28,7 +28,7 @@
|
||||||
#define APP_ID "ninjarig"
|
#define APP_ID "ninjarig"
|
||||||
#define APP_NAME "NinjaRig"
|
#define APP_NAME "NinjaRig"
|
||||||
#define APP_DESC "NinjaRig CPU/GPU miner"
|
#define APP_DESC "NinjaRig CPU/GPU miner"
|
||||||
#define APP_VERSION "1.0.2"
|
#define APP_VERSION "1.0.3"
|
||||||
//#define APP_DOMAIN "xmrig.com"
|
//#define APP_DOMAIN "xmrig.com"
|
||||||
//#define APP_SITE "www.xmrig.com"
|
//#define APP_SITE "www.xmrig.com"
|
||||||
#define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan"
|
#define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan"
|
||||||
|
@ -36,7 +36,7 @@
|
||||||
|
|
||||||
#define APP_VER_MAJOR 1
|
#define APP_VER_MAJOR 1
|
||||||
#define APP_VER_MINOR 0
|
#define APP_VER_MINOR 0
|
||||||
#define APP_VER_PATCH 2
|
#define APP_VER_PATCH 3
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
# if (_MSC_VER >= 1920)
|
# if (_MSC_VER >= 1920)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue