diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp index cf4497b2..af817470 100644 --- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp @@ -847,7 +847,6 @@ __kernel void fill_blocks(__global ulong *chunk_0, int prev_idx = cur_seg[1]; int seg_type = cur_seg[2]; int ref_idx = 0; - ulong4 ref = 0, next = 0; prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG; @@ -856,24 +855,24 @@ __kernel void fill_blocks(__global ulong *chunk_0, if(seg_type == 0) { seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); ref_idx = seg_refs[0]; + prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG); if(idxs != 0) { seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); cur_idx = seg_idxs[0]; } - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - for (int i=0;idx < seg_length;i++, idx++) { next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(wave_id, next_block); + prefetch(next_block, BLOCK_SIZE_ULONG); - tmp ^= ref; + tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); if (idx < seg_length - 1) { ref_idx = seg_refs[i + 1]; + prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG); if(idxs != 0) { keep = cur_idx & 0x80000000; @@ -881,8 +880,6 @@ __kernel void fill_blocks(__global ulong *chunk_0, } else cur_idx++; - - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); } vstore4(tmp, id, state); @@ -893,7 +890,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, G4(state); if(with_xor == 1) - tmp ^= next; + tmp ^= vload4(wave_id, next_block); tmp ^= vload4(id, state); @@ -911,7 +908,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; if(with_xor == 1) - next = vload4(wave_id, next_block); + prefetch(next_block, BLOCK_SIZE_ULONG); ulong pseudo_rand = state[0]; @@ -960,9 +957,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; } - ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); - - tmp ^= ref; + tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); vstore4(tmp, id, state); @@ -972,7 +967,7 @@ __kernel void fill_blocks(__global ulong *chunk_0, G4(state); if(with_xor == 1) - tmp ^= next; + tmp ^= vload4(wave_id, next_block); tmp ^= vload4(id, state);