OpenCL kernel optimization - use the prefetch() built-in to preload blocks instead of loading them early with vload4.

This commit is contained in:
Haifa Bogdan Adnan 2019-09-08 08:40:57 +03:00
parent f4de892742
commit 0d9d687c3d

View file

@ -847,7 +847,6 @@ __kernel void fill_blocks(__global ulong *chunk_0,
int prev_idx = cur_seg[1]; int prev_idx = cur_seg[1];
int seg_type = cur_seg[2]; int seg_type = cur_seg[2];
int ref_idx = 0; int ref_idx = 0;
ulong4 ref = 0, next = 0;
prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG; prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;
@ -856,24 +855,24 @@ __kernel void fill_blocks(__global ulong *chunk_0,
if(seg_type == 0) { if(seg_type == 0) {
seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
ref_idx = seg_refs[0]; ref_idx = seg_refs[0];
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
if(idxs != 0) { if(idxs != 0) {
seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2); seg_idxs = idxs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
cur_idx = seg_idxs[0]; cur_idx = seg_idxs[0];
} }
ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
for (int i=0;idx < seg_length;i++, idx++) { for (int i=0;idx < seg_length;i++, idx++) {
next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG; next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1) if(with_xor == 1)
next = vload4(wave_id, next_block); prefetch(next_block, BLOCK_SIZE_ULONG);
tmp ^= ref; tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
if (idx < seg_length - 1) { if (idx < seg_length - 1) {
ref_idx = seg_refs[i + 1]; ref_idx = seg_refs[i + 1];
prefetch(memory + ref_idx * 2 * BLOCK_SIZE_ULONG, BLOCK_SIZE_ULONG);
if(idxs != 0) { if(idxs != 0) {
keep = cur_idx & 0x80000000; keep = cur_idx & 0x80000000;
@ -881,8 +880,6 @@ __kernel void fill_blocks(__global ulong *chunk_0,
} }
else else
cur_idx++; cur_idx++;
ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
} }
vstore4(tmp, id, state); vstore4(tmp, id, state);
@ -893,7 +890,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
G4(state); G4(state);
if(with_xor == 1) if(with_xor == 1)
tmp ^= next; tmp ^= vload4(wave_id, next_block);
tmp ^= vload4(id, state); tmp ^= vload4(id, state);
@ -911,7 +908,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG; next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;
if(with_xor == 1) if(with_xor == 1)
next = vload4(wave_id, next_block); prefetch(next_block, BLOCK_SIZE_ULONG);
ulong pseudo_rand = state[0]; ulong pseudo_rand = state[0];
@ -960,9 +957,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length; ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
} }
ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG); tmp ^= vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
tmp ^= ref;
vstore4(tmp, id, state); vstore4(tmp, id, state);
@ -972,7 +967,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
G4(state); G4(state);
if(with_xor == 1) if(with_xor == 1)
tmp ^= next; tmp ^= vload4(wave_id, next_block);
tmp ^= vload4(id, state); tmp ^= vload4(id, state);