Optimizations to OpenCL kernel - added optional support for amd_bitalign, added special case for lanes = 1 in index calculation for argon2d.

This commit is contained in:
Haifa Bogdan Adnan 2019-09-07 22:39:57 +03:00
parent d8daeda7ba
commit bfb397f217
4 changed files with 113 additions and 51 deletions

View file

@ -589,29 +589,47 @@ __global__ void fill_blocks(uint32_t *scratchpad0,
}
uint32_t pseudo_rand_lo = __shfl_sync(0xffffffff, tmp_a.x, 0);
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
uint32_t reference_area_size = 0;
if(pass > 0) {
if (lane == ref_lane) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
}
}
if(lanes > 1) {
uint32_t pseudo_rand_hi = __shfl_sync(0xffffffff, tmp_a.y, 0);
uint64_t ref_lane = pseudo_rand_hi % lanes; // thr_cost
uint32_t reference_area_size = 0;
if (pass > 0) {
if (lane == ref_lane) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
}
} else {
if (lane == ref_lane) {
reference_area_size = slice * seg_length + idx - 1; // seg_length
} else {
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
}
}
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
ref_idx = ref_lane * lane_length +
(((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
lane_length;
}
else {
if (lane == ref_lane) {
reference_area_size = slice * seg_length + idx - 1; // seg_length
} else {
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
}
uint32_t reference_area_size = 0;
if (pass > 0) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = slice * seg_length + idx - 1; // seg_length
}
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) %
lane_length;
}
asm("{mul.hi.u32 %0, %1, %1; mul.hi.u32 %0, %0, %2; }": "=r"(pseudo_rand_lo) : "r"(pseudo_rand_lo), "r"(reference_area_size));
uint32_t relative_position = reference_area_size - 1 - pseudo_rand_lo;
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
ref_block = memory + ref_idx * BLOCK_SIZE_UINT4;

View file

@ -196,6 +196,24 @@ OpenCLDeviceInfo *OpenCLHasher::getDeviceInfo(cl_platform_id platform, cl_device
device_info->deviceString = device_vendor + " - " + device_name/* + " : " + device_version*/;
string extensions;
sz = 0;
clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, 0, NULL, &sz);
buffer = (char *)malloc(sz + 1);
device_info->error = clGetDeviceInfo(device, CL_DEVICE_EXTENSIONS, sz, buffer, &sz);
if(device_info->error != CL_SUCCESS) {
free(buffer);
device_info->errorMessage = "Error querying device extensions.";
return device_info;
}
else {
buffer[sz] = 0;
extensions = buffer;
free(buffer);
}
device_info->deviceExtensions = extensions;
device_info->error = clGetDeviceInfo(device, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(device_info->maxMemSize), &(device_info->maxMemSize), NULL);
if(device_info->error != CL_SUCCESS) {
device_info->errorMessage = "Error querying device global memory size.";
@ -362,7 +380,10 @@ bool OpenCLHasher::setupDeviceInfo(OpenCLDeviceInfo *device, double intensity) {
return false;
}
error = clBuildProgram(device->program, 1, &device->device, "", NULL, NULL);
string options = "";
if(device->deviceExtensions.find("cl_amd_media_ops") != string::npos)
options += "-D USE_AMD_BITALIGN";
error = clBuildProgram(device->program, 1, &device->device, options.c_str(), NULL, NULL);
if (error != CL_SUCCESS) {
size_t log_size;
clGetProgramBuildInfo(device->program, device->device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size);

View file

@ -64,6 +64,7 @@ struct OpenCLDeviceInfo {
Argon2ProfileInfo profileInfo;
string deviceString;
string deviceExtensions;
uint64_t maxMemSize;
uint64_t maxAllocableMemSize;

View file

@ -26,6 +26,14 @@ string OpenCLKernel = R"OCL(
#define BLOCK_BYTES 32
#define OUT_BYTES 16
#ifdef USE_AMD_BITALIGN
#pragma OPENCL EXTENSION cl_amd_media_ops : enable
#define rotr64(x, n) ((n) < 32 ? (amd_bitalign((uint)((x) >> 32), (uint)(x), (uint)(n)) | ((ulong)amd_bitalign((uint)(x), (uint)((x) >> 32), (uint)(n)) << 32)) : rotate((x), 64UL - (n)))
#else
#define rotr64(x, n) rotate((x), 64UL - (n))
#endif
#define G(m, r, i, a, b, c, d) \
{ \
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
@ -90,11 +98,6 @@ string OpenCLKernel = R"OCL(
v3 = shfl[t + 12]; \
}
ulong rotr64(ulong x, ulong n)
{
return rotate(x, 64 - n);
}
__constant ulong blake2b_IV[8] = {
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
@ -547,13 +550,13 @@ void blake2b_digestLong_local(__global uint *out, int out_len,
#define COMPUTE \
a = fBlaMka(a, b); \
d = rotate(d ^ a, (ulong)32); \
d = rotr64(d ^ a, (ulong)32); \
c = fBlaMka(c, d); \
b = rotate(b ^ c, (ulong)40); \
b = rotr64(b ^ c, (ulong)24); \
a = fBlaMka(a, b); \
d = rotate(d ^ a, (ulong)48); \
d = rotr64(d ^ a, (ulong)16); \
c = fBlaMka(c, d); \
b = rotate(b ^ c, (ulong)1);
b = rotr64(b ^ c, (ulong)63);
__constant char offsets_round_1[32][4] = {
{ 0, 4, 8, 12 },
@ -905,31 +908,50 @@ __kernel void fill_blocks(__global ulong *chunk_0,
for (int i=0;idx < seg_length;i++, idx++, cur_idx++) {
ulong pseudo_rand = state[0];
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
uint reference_area_size = 0;
if(lanes == 1) {
uint reference_area_size = 0;
if(pass > 0) {
if (lane == ref_lane) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
}
}
else {
if (lane == ref_lane) {
reference_area_size = slice * seg_length + idx - 1; // seg_length
} else {
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
}
}
if(pass > 0) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = slice * seg_length + idx - 1; // seg_length
}
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
relative_position = (relative_position * relative_position) >> 32;
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
relative_position = (relative_position * relative_position) >> 32;
relative_position = reference_area_size - 1 -
((reference_area_size * relative_position) >> 32);
relative_position = reference_area_size - 1 -
((reference_area_size * relative_position) >> 32);
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
ref_idx = (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
}
else {
ulong ref_lane = ((pseudo_rand >> 32)) % lanes; // thr_cost
uint reference_area_size = 0;
if(pass > 0) {
if (lane == ref_lane) {
reference_area_size = lane_length - seg_length + idx - 1;
} else {
reference_area_size = lane_length - seg_length + ((idx == 0) ? (-1) : 0);
}
}
else {
if (lane == ref_lane) {
reference_area_size = slice * seg_length + idx - 1; // seg_length
} else {
reference_area_size = slice * seg_length + ((idx == 0) ? (-1) : 0);
}
}
ulong relative_position = pseudo_rand & 0xFFFFFFFF;
relative_position = (relative_position * relative_position) >> 32;
relative_position = reference_area_size - 1 -
((reference_area_size * relative_position) >> 32);
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
}
ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);