diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp
index af682fbe..d31bf69b 100755
--- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp
+++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLHasher.cpp
@@ -760,9 +760,9 @@ void *opencl_kernel_filler(int threads, Argon2Profile *profile, void *user_data)
     cl_int error;

     size_t total_work_items = threads * KERNEL_WORKGROUP_SIZE * profile->thrCost;
-    size_t local_work_items = KERNEL_WORKGROUP_SIZE * profile->thrCost;
+    size_t local_work_items = 2 * KERNEL_WORKGROUP_SIZE * profile->thrCost;

-    size_t shared_mem = profile->thrCost * ARGON2_QWORDS_IN_BLOCK;
+    size_t shared_mem = 2 * profile->thrCost * ARGON2_QWORDS_IN_BLOCK;

     clSetKernelArg(device->kernelFillBlocks, 6, sizeof(device->arguments.seedMemory[gpumgmt_thread->threadId]), &device->arguments.seedMemory[gpumgmt_thread->threadId]);
     clSetKernelArg(device->kernelFillBlocks, 7, sizeof(device->arguments.outMemory[gpumgmt_thread->threadId]), &device->arguments.outMemory[gpumgmt_thread->threadId]);
diff --git a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp
index f360189f..fad3b886 100644
--- a/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp
+++ b/src/crypto/argon2_hasher/hash/gpu/opencl/OpenCLKernel.cpp
@@ -771,13 +771,17 @@ __kernel void fill_blocks(__global ulong *chunk_0,
     ulong4 tmp;
     ulong a, b, c, d;

-    int hash = get_group_id(0);
+    int hash_base = get_group_id(0) * 2;
     int local_id = get_local_id(0);

-    int id = local_id % THREADS_PER_LANE;
-    int lane = local_id / THREADS_PER_LANE;
+    int hash_idx = (local_id / THREADS_PER_LANE) % 2;
+    int wave_id = local_id % (THREADS_PER_LANE * 2);
+    int id = wave_id % THREADS_PER_LANE;
+    int lane = local_id / (THREADS_PER_LANE * 2);
     int lane_length = seg_length * 4;

+    int hash = hash_base + hash_idx;
+
     ulong chunks[6];
     chunks[0] = (ulong)chunk_0;
     chunks[1] = (ulong)chunk_1;
@@ -785,8 +789,8 @@ __kernel void fill_blocks(__global ulong *chunk_0,
     chunks[3] = (ulong)chunk_3;
     chunks[4] = (ulong)chunk_4;
     chunks[5] = (ulong)chunk_5;
-    int chunk_index = hash / threads_per_chunk;
-    int chunk_offset = hash - chunk_index * threads_per_chunk;
+    int chunk_index = hash_base / threads_per_chunk;
+    int chunk_offset = hash_base - chunk_index * threads_per_chunk;
     __global ulong *memory = (__global ulong *)chunks[chunk_index] + chunk_offset * (memsize / 8);

     int i1_0 = offsets_round_1[id][0];
@@ -809,15 +813,13 @@ __kernel void fill_blocks(__global ulong *chunk_0,
     int i4_2 = offsets_round_4[id][2];
     int i4_3 = offsets_round_4[id][3];

-    __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
     __global ulong *seed_mem = seed + hash * lanes * 2 * BLOCK_SIZE_ULONG + lane * 2 * BLOCK_SIZE_ULONG;
-
-    __global ulong *seed_dst = memory + lane * lane_length * BLOCK_SIZE_ULONG;
+    __global ulong *seed_dst = memory + (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;

     vstore4(vload4(id, seed_mem), id, seed_dst);

     seed_mem += BLOCK_SIZE_ULONG;
-    seed_dst += BLOCK_SIZE_ULONG;
+    seed_dst += (2 * BLOCK_SIZE_ULONG);

     vstore4(vload4(id, seed_mem), id, seed_dst);

@@ -826,7 +828,7 @@ __kernel void fill_blocks(__global ulong *chunk_0,
     __global uint *seg_refs;
     __global uint *seg_idxs;

-    __local ulong *state = scratchpad + lane * BLOCK_SIZE_ULONG;
+    __local ulong *state = scratchpad + (lane * 2 + hash_idx) * BLOCK_SIZE_ULONG;

     segments += (lane * 3);

@@ -845,9 +847,9 @@ __kernel void fill_blocks(__global ulong *chunk_0,
         int ref_idx = 0;
         ulong4 ref = 0, next = 0;

-        prev_block = memory + prev_idx * BLOCK_SIZE_ULONG;
+        prev_block = memory + prev_idx * 2 * BLOCK_SIZE_ULONG;

-        tmp = vload4(id, prev_block);
+        tmp = vload4(wave_id, prev_block);

        if(seg_type == 0) {
            seg_refs = refs + ((s * lanes + lane) * seg_length - ((s > 0) ? lanes : lane) * 2);
@@ -858,13 +860,13 @@
                cur_idx = seg_idxs[0];
            }

-            ulong4 nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+            ulong4 nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);

            for (int i=0;idx < seg_length;i++, idx++) {
-                next_block = memory + (cur_idx & 0x7FFFFFFF) * BLOCK_SIZE_ULONG;
+                next_block = memory + (cur_idx & 0x7FFFFFFF) * 2 * BLOCK_SIZE_ULONG;

                if(with_xor == 1)
-                    next = vload4(id, next_block);
+                    next = vload4(wave_id, next_block);

                ref = nextref;

@@ -878,7 +880,7 @@
                else
                    cur_idx++;

-                nextref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+                nextref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);
            }

            tmp ^= ref;
@@ -896,7 +898,7 @@
                tmp ^= vload4(id, state);

                if(keep > 0) {
-                    vstore4(tmp, id, next_block);
+                    vstore4(tmp, wave_id, next_block);
                    barrier(CLK_GLOBAL_MEM_FENCE);
                }
            }
@@ -953,12 +955,12 @@
                    ref_idx = ref_lane * lane_length + (((pass > 0 && slice < 3) ? ((slice + 1) * seg_length) : 0) + relative_position) % lane_length;
                }

-                ref = vload4(id, memory + ref_idx * BLOCK_SIZE_ULONG);
+                ref = vload4(wave_id, memory + ref_idx * 2 * BLOCK_SIZE_ULONG);

-                next_block = memory + cur_idx * BLOCK_SIZE_ULONG;
+                next_block = memory + cur_idx * 2 * BLOCK_SIZE_ULONG;

                if(with_xor == 1)
-                    next = vload4(id, next_block);
+                    next = vload4(wave_id, next_block);

                tmp ^= ref;
@@ -975,7 +977,7 @@
                tmp ^= vload4(id, state);
                vstore4(tmp, id, state);

-                vstore4(tmp, id, next_block);
+                vstore4(tmp, wave_id, next_block);
                barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
            }
        }
@@ -985,8 +987,9 @@
        barrier(CLK_LOCAL_MEM_FENCE);

        if(lane == 0) { // first lane needs to acumulate results
+            __global ulong *out_mem = out + hash * BLOCK_SIZE_ULONG;
            for(int l=1; l<lanes; l++) {
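
Reviewer note: taken together, these changes make each workgroup process two hashes at a time. The host side doubles local_work_items and the __local scratchpad allocation; the kernel derives a pair index hash_idx from the work-item id and interleaves the two hashes' Argon2 blocks at BLOCK_SIZE_ULONG granularity, which is why every global block stride gains a "* 2" and the vload4/vstore4 calls that touch block memory switch from id to wave_id. The standalone sketch below (plain C; lanes, seg_length, and group_id are made-up example values, not taken from this PR) replays the new index math so the layout can be sanity-checked outside the kernel:

#include <stdio.h>

#define THREADS_PER_LANE 32   /* 32 work-items cooperate on one 1 KB block */
#define BLOCK_SIZE_ULONG 128  /* an Argon2 block is 1024 bytes = 128 ulongs */

int main(void) {
    /* Hypothetical example parameters; real values come from the Argon2 profile. */
    int lanes = 2;
    int seg_length = 4;
    int lane_length = seg_length * 4;
    int group_id = 3;                     /* stand-in for get_group_id(0) */

    int hash_base = group_id * 2;         /* each workgroup now serves two hashes */

    for (int local_id = 0; local_id < lanes * THREADS_PER_LANE * 2; local_id++) {
        int hash_idx = (local_id / THREADS_PER_LANE) % 2;  /* which hash of the pair */
        int wave_id  = local_id % (THREADS_PER_LANE * 2);  /* slot in the doubled wave */
        int id       = wave_id % THREADS_PER_LANE;         /* thread within its lane */
        int lane     = local_id / (THREADS_PER_LANE * 2);  /* Argon2 lane */
        int hash     = hash_base + hash_idx;

        /* Interleaved layout: block b of pair member h starts at ulong offset
           (b * 2 + h) * BLOCK_SIZE_ULONG. vload4(wave_id, p) reads 4 ulongs at
           p + wave_id * 4, so the half-wave with hash_idx == 1 lands exactly one
           BLOCK_SIZE_ULONG past its sibling, matching the kernel's
           "* 2 * BLOCK_SIZE_ULONG" strides. */
        int seed_dst_off = (lane * lane_length * 2 + hash_idx) * BLOCK_SIZE_ULONG;

        if (id == 0)
            printf("local_id=%3d -> hash=%d lane=%d hash_idx=%d seed block offset=%d\n",
                   local_id, hash, lane, hash_idx, seed_dst_off);
    }
    return 0;
}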
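If I am reading the layout right, the interleaving keeps a doubled wave's global accesses contiguous (the paired hash's block sits immediately after its sibling's), so the wider workgroup still issues coalesced transactions; that, rather than occupancy alone, appears to be the point of pairing hashes per workgroup.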