Optimizations to OpenCL kernel - added optional support for amd_bitalign, added special case for lanes = 1 in index calculation for argon2d.
This commit is contained in:
parent
d8daeda7ba
commit
bfb397f217
4 changed files with 113 additions and 51 deletions
|
@ -589,29 +589,47 @@ __global__ void fill_blocks(uint32_t *scratchpad0,
|
|||
}
|
||||
|
||||
uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0);
|
||||
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
|
||||
|
||||
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
|
||||
uint32_t reference_area_size = 0;
|
||||
if(pass > 0) {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
if(lanes > 1) {
|
||||
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
|
||||
|
||||
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
|
||||
uint32_t reference_area_size = 0;
|
||||
if (pass > 0) {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
} else {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
|
||||
|
||||
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
||||
|
||||
ref_idx = ref_lane * lane_length +
|
||||
(((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
|
||||
lane_length;
|
||||
}
|
||||
else {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
uint32_t reference_area_size = 0;
|
||||
if (pass > 0) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
}
|
||||
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
|
||||
|
||||
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
||||
|
||||
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
|
||||
lane_length;
|
||||
}
|
||||
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
|
||||
|
||||
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
|
||||
|
||||
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||
|
||||
ref_block = memory + ref_idx * BLOCK_SIZE_UINT4;
|
||||
|
||||
|
|
|
@ -196,6 +196,24 @@ OpenCLDeviceInfo *OpenCLHasher::getDeviceInfo(cl_platform_id platform, cl_device
|
|||
|
||||
device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/;
|
||||
|
||||
string extensions;
|
||||
sz = 0;
|
||||
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &sz);
|
||||
buffer = (char *)malloc(sz + 1);
|
||||
device_info->error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sz, buffer, &sz);
|
||||
if(device_info->error != CL_SUCCESS) {
|
||||
free(buffer);
|
||||
device_info->errorMessage = "Error querying device extensions.";
|
||||
return device_info;
|
||||
}
|
||||
else {
|
||||
buffer[sz] = 0;
|
||||
extensions = buffer;
|
||||
free(buffer);
|
||||
}
|
||||
|
||||
device_info->deviceExtensions = extensions;
|
||||
|
||||
device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL);
|
||||
if(device_info->error != CL_SUCCESS) {
|
||||
device_info->errorMessage = "Error querying device global memory size.";
|
||||
|
@ -362,7 +380,10 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
|
|||
return false;
|
||||
}
|
||||
|
||||
error = clBuildProgram(device->program, 1, &device->device, "", NULL, NULL);
|
||||
string options = "";
|
||||
if(device->deviceExtensions.find("cl_amd_media_ops") != string::npos)
|
||||
options += "-D USE_AMD_BITALIGN";
|
||||
error = clBuildProgram(device->program, 1, &device->device, options.c_str(), NULL, NULL);
|
||||
if (error != CL_SUCCESS) {
|
||||
size_t log_size;
|
||||
clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);
|
||||
|
|
|
@ -64,6 +64,7 @@ struct OpenCLDeviceInfo {
|
|||
Argon2ProfileInfo profileInfo;
|
||||
|
||||
string deviceString;
|
||||
string deviceExtensions;
|
||||
uint64_t maxMemSize;
|
||||
uint64_t maxAllocableMemSize;
|
||||
|
||||
|
|
|
@ -26,6 +26,14 @@ string OpenCLKernel = R"OCL(
|
|||
#define BLOCK_BYTES 32
|
||||
#define OUT_BYTES 16
|
||||
|
||||
#ifdef USE_AMD_BITALIGN
|
||||
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
|
||||
|
||||
#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n)))
|
||||
#else
|
||||
#define rotr64(x, n) rotate((x), 64UL - (n))
|
||||
#endif
|
||||
|
||||
#define G(m, r, i, a, b, c, d) \
|
||||
{ \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
||||
|
@ -90,11 +98,6 @@ string OpenCLKernel = R"OCL(
|
|||
v3 = shfl[t + 12]; \
|
||||
}
|
||||
|
||||
ulong rotr64(ulong x, ulong n)
|
||||
{
|
||||
return rotate(x, 64 - n);
|
||||
}
|
||||
|
||||
__constant ulong blake2b_IV[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
|
@ -547,13 +550,13 @@ void blake2b_digestLong_local(__global uint *out, int out_len,
|
|||
|
||||
#define COMPUTE \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotate(d ^ a, (ulong)32); \
|
||||
d = rotr64(d ^ a, (ulong)32); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotate(b ^ c, (ulong)40); \
|
||||
b = rotr64(b ^ c, (ulong)24); \
|
||||
a = fBlaMka(a, b); \
|
||||
d = rotate(d ^ a, (ulong)48); \
|
||||
d = rotr64(d ^ a, (ulong)16); \
|
||||
c = fBlaMka(c, d); \
|
||||
b = rotate(b ^ c, (ulong)1);
|
||||
b = rotr64(b ^ c, (ulong)63);
|
||||
|
||||
__constant char offsets_round_1[32][4] = {
|
||||
{ 0, 4, 8, 12 },
|
||||
|
@ -905,31 +908,50 @@ __kernel void fill_blocks(__global ulong *chunk_0,
|
|||
for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
|
||||
ulong pseudo_rand = state[0];
|
||||
|
||||
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
|
||||
uint reference_area_size = 0;
|
||||
if(lanes == 1) {
|
||||
uint reference_area_size = 0;
|
||||
|
||||
if(pass > 0) {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
if(pass > 0) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
}
|
||||
|
||||
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
|
||||
relative_position = (relative_position * relative_position) >> 32;
|
||||
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
|
||||
relative_position = (relative_position * relative_position) >> 32;
|
||||
|
||||
relative_position = reference_area_size - 1 -
|
||||
((reference_area_size * relative_position) >> 32);
|
||||
relative_position = reference_area_size - 1 -
|
||||
((reference_area_size * relative_position) >> 32);
|
||||
|
||||
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||
}
|
||||
else {
|
||||
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
|
||||
uint reference_area_size = 0;
|
||||
|
||||
if(pass > 0) {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = lane_length - seg_length + idx - 1;
|
||||
} else {
|
||||
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
else {
|
||||
if (lane == ref_lane) {
|
||||
reference_area_size = slice * seg_length + idx - 1; // seg_length
|
||||
} else {
|
||||
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
|
||||
}
|
||||
}
|
||||
|
||||
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
|
||||
relative_position = (relative_position * relative_position) >> 32;
|
||||
|
||||
relative_position = reference_area_size - 1 -
|
||||
((reference_area_size * relative_position) >> 32);
|
||||
|
||||
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
|
||||
}
|
||||
|
||||
ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue