OpenCL kernel optimization - use prefetch instruction for preloading.

2019-09-08 08:40:57 +03:00 · 2019-09-08 08:40:57 +03:00 · 0d9d687c3d
commit 0d9d687c3d
parent f4de892742
1 changed files with 8 additions and 13 deletions
--- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp
+++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp
@@ -847,7 +847,6 @@ __kernel void fill_blocks(__global ulong *chunk_0,
        int prev_idx = cur_seg[1];
        int seg_type = cur_seg[2];
        int ref_idx = 0;
-        ulong4 ref = 0, next = 0;
 		prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;
@@ -856,24 +855,24 @@ __kernel void fill_blocks(__global ulong *chunk_0,
        if(seg_type == 0) {
            seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
            ref_idx = seg_refs[0];
+            prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
            if(idxs != 0) {
                seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
                cur_idx = seg_idxs[0];
            }
-            ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
            for (int i=0;idx < seg_length;i++, idx++) {
    			next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;
                if(with_xor == 1)
-                    next = vload4(wave_id, next_block);
+                    prefetch(next_block, BLOCK_SIZE_ULONG);
-                tmp ^= ref;
+                tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
                if (idx < seg_length - 1) {
                    ref_idx = seg_refs[i + 1];
+                    prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
                    if(idxs != 0) {
                        keep = cur_idx & 0x80000000;
@@ -881,8 +880,6 @@ __kernel void fill_blocks(__global ulong *chunk_0,
                    }
                    else
                        cur_idx++;
-                    ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
                }
                vstore4(tmp, id, state);
@@ -893,7 +890,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
                G4(state);
                if(with_xor == 1)
-                    tmp ^= next;
+                    tmp ^= vload4(wave_id, next_block);
                tmp ^= vload4(id, state);
@@ -911,7 +908,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
    			next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;
                if(with_xor == 1)
-                    next = vload4(wave_id, next_block);
+                    prefetch(next_block, BLOCK_SIZE_ULONG);
                ulong pseudo_rand = state[0];
@@ -960,9 +957,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
    				ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
                }
-        		ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
+                tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
-                tmp ^= ref;
                vstore4(tmp, id, state);
@@ -972,7 +967,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
                G4(state);
                if(with_xor == 1)
-                    tmp ^= next;
+                    tmp ^= vload4(wave_id, next_block);
                tmp ^= vload4(id, state);