Optimizations to OpenCL kernel - resize workgroup to 64 threads to fully use AMD wavefront size.

Haifa Bogdan Adnan 2019-09-07 23:28:37 +03:00
parent 33c218f7bf
commit 4e0d75b611
2 changed files with 27 additions and 24 deletions
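In short, each workgroup now runs two hashes side by side, so the kernel's work maps onto the full 64-thread AMD wavefront. The sketch below restates the new index derivation from the kernel diff with explanatory comments; it assumes THREADS_PER_LANE is 32 (one thread per ulong4 of a 128-qword Argon2 block), and the numeric example is purely illustrative.

    // Index math introduced by this commit (names mirror the kernel source).
    int local_id  = get_local_id(0);                    // thread index within the workgroup
    int hash_base = get_group_id(0) * 2;                // each workgroup now covers a pair of hashes
    int hash_idx  = (local_id / THREADS_PER_LANE) % 2;  // selects hash 0 or 1, alternating every 32 threads
    int wave_id   = local_id % (THREADS_PER_LANE * 2);  // position within the 64-thread wavefront
    int id        = wave_id % THREADS_PER_LANE;         // ulong4 slot within one block, 0..31
    int lane      = local_id / (THREADS_PER_LANE * 2);  // Argon2 lane, advancing every 64 threads
    int hash      = hash_base + hash_idx;               // the hash this thread actually fills

    // Example: local_id 40 in workgroup 3 -> hash_idx 1, wave_id 40, id 8, lane 0, hash 7.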


@@ -760,9 +760,9 @@ void *opencl_kernel_filler(int threads, Argon2Profile *profile, void *user_data)
cl_int error;
size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost;
- size_t local_work_items = KERNEL_WORKGROUP_SIZE * profile->thrCost;
+ size_t local_work_items = 2 * KERNEL_WORKGROUP_SIZE * profile->thrCost;
- size_t shared_mem = profile->thrCost * ARGON2_QWORDS_IN_BLOCK;
+ size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK;
clSetKernelArg(device->kernelFillBlocks, 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
clSetKernelArg(device->kernelFillBlocks, 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
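On the host side, the doubled local_work_items and shared_mem values would feed the kernel launch roughly as sketched below. This is a hedged illustration, not code from this commit: the command queue handle, the __local argument index 8, and the clEnqueueNDRangeKernel call are assumptions.

    // Sketch only, under the assumptions stated above.
    size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost;
    size_t local_work_items = 2 * KERNEL_WORKGROUP_SIZE * profile->thrCost;  // 64-thread groups, two hashes each
    size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK;       // __local qwords for both hashes

    // __local scratchpad memory is passed by size with a NULL data pointer;
    // the argument index 8 is hypothetical.
    clSetKernelArg(device->kernelFillBlocks, 8, shared_mem * sizeof(cl_ulong), NULL);

    // With the local size doubled and the global size unchanged, half as many
    // workgroups are launched, each covering a pair of hashes.
    cl_int error = clEnqueueNDRangeKernel(queue, device->kernelFillBlocks, 1, NULL,
                                          &total_work_items, &local_work_items, 0, NULL, NULL);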


@@ -771,13 +771,17 @@ __kernel void fill_blocks(__global ulong *chunk_0,
ulong4 tmp;
ulong a, b, c, d;
- int hash = get_group_id(0);
+ int hash_base = get_group_id(0) * 2;
int local_id = get_local_id(0);
- int id = local_id % THREADS_PER_LANE;
- int lane = local_id / THREADS_PER_LANE;
+ int hash_idx = (local_id / THREADS_PER_LANE) % 2;
+ int wave_id = local_id % (THREADS_PER_LANE * 2);
+ int id = wave_id % THREADS_PER_LANE;
+ int lane = local_id / (THREADS_PER_LANE * 2);
int lane_length = seg_length * 4;
+ int hash = hash_base + hash_idx;
ulong chunks[6];
chunks[0] = (ulong)chunk_0;
chunks[1] = (ulong)chunk_1;
@@ -785,8 +789,8 @@ __kernel void fill_blocks(__global ulong *chunk_0,
chunks[3] = (ulong)chunk_3;
chunks[4] = (ulong)chunk_4;
chunks[5] = (ulong)chunk_5;
- int chunk_index = hash / threads_per_chunk;
- int chunk_offset = hash - chunk_index * threads_per_chunk;
+ int chunk_index = hash_base / threads_per_chunk;
+ int chunk_offset = hash_base - chunk_index * threads_per_chunk;
__global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);
int i1_0 = offsets_round_1[id][0];
@@ -809,15 +813,13 @@ __kernel void fill_blocks(__global ulong *chunk_0,
int i4_2 = offsets_round_4[id][2];
int i4_3 = offsets_round_4[id][3];
- __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
__global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
- __global ulong *seed_dst = memory + lane * lane_length * BLOCK_SIZE_ULONG;
+ __global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;
vstore4(vload4(id, seed_mem), id, seed_dst);
seed_mem += BLOCK_SIZE_ULONG;
- seed_dst += BLOCK_SIZE_ULONG;
+ seed_dst += (2 * BLOCK_SIZE_ULONG);
vstore4(vload4(id, seed_mem), id, seed_dst);
@@ -826,7 +828,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
__global uint *seg_refs;
__global uint *seg_idxs;
- __local ulong *state = scratchpad + lane * BLOCK_SIZE_ULONG;
+ __local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG;
segments += (lane * 3);
@@ -845,9 +847,9 @@ __kernel void fill_blocks(__global ulong *chunk_0,
int ref_idx = 0;
ulong4 ref = 0, next = 0;
- prev_block = memory + prev_idx * BLOCK_SIZE_ULONG;
+ prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;
- tmp = vload4(id, prev_block);
+ tmp = vload4(wave_id, prev_block);
if(seg_type == 0) {
seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
@@ -858,13 +860,13 @@ __kernel void fill_blocks(__global ulong *chunk_0,
cur_idx = seg_idxs[0];
}
- ulong4 nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+ ulong4 nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
for (int i=0;idx < seg_length;i++, idx++) {
- next_block = memory + (cur_idx & 0x7FFFFFFF) * BLOCK_SIZE_ULONG;
+ next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1)
- next = vload4(id, next_block);
+ next = vload4(wave_id, next_block);
ref = nextref;
@@ -878,7 +880,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
else
cur_idx++;
- nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+ nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
}
tmp ^= ref;
@@ -896,7 +898,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
tmp ^= vload4(id, state);
if(keep > 0) {
- vstore4(tmp, id, next_block);
+ vstore4(tmp, wave_id, next_block);
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
@@ -953,12 +955,12 @@ __kernel void fill_blocks(__global ulong *chunk_0,
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
}
- ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+ ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
- next_block = memory + cur_idx * BLOCK_SIZE_ULONG;
+ next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1)
- next = vload4(id, next_block);
+ next = vload4(wave_id, next_block);
tmp ^= ref;
@@ -975,7 +977,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
tmp ^= vload4(id, state);
vstore4(tmp, id, state);
- vstore4(tmp, id, next_block);
+ vstore4(tmp, wave_id, next_block);
barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
}
}
@@ -985,8 +987,9 @@ __kernel void fill_blocks(__global ulong *chunk_0,
barrier(CLK_LOCAL_MEM_FENCE);
if(lane == 0) { // first lane needs to acumulate results
+ __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
for(int l=1; l<lanes; l++)
- tmp ^= vload4(id, scratchpad + l * BLOCK_SIZE_ULONG);
+ tmp ^= vload4(id, scratchpad + (l * 2 + hash_idx) * BLOCK_SIZE_ULONG);
vstore4(tmp, id, out_mem);
}
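
Taken together, the doubled strides (prev_idx * 2 * BLOCK_SIZE_ULONG, ref_idx * 2 * BLOCK_SIZE_ULONG and so on), the + hash_idx offsets, and the switch from id to wave_id in vload4/vstore4 interleave the block arrays of the two hashes handled by one workgroup. A sketch of the implied layout, assuming BLOCK_SIZE_ULONG is 128 (one 1 KB Argon2 block):

    // Flat block index i (across all lanes) of the workgroup's pair of hashes:
    //   block i of hash 0 -> memory + (2 * i + 0) * BLOCK_SIZE_ULONG
    //   block i of hash 1 -> memory + (2 * i + 1) * BLOCK_SIZE_ULONG
    //
    // vload4(wave_id, memory + i * 2 * BLOCK_SIZE_ULONG) then spans both blocks:
    // wave_id 0..31 (hash_idx 0) covers the first, wave_id 32..63 (hash_idx 1) the
    // second, so the 64-thread wavefront touches one contiguous 2 KB span per step.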