AstroBWT OpenCL code
This commit is contained in:
parent 50eb7ba2fd
commit fbedf197ab
34 changed files with 2106 additions and 6 deletions
src/backend/opencl/cl/astrobwt/BWT.cl (new file, +199)
@@ -0,0 +1,199 @@
/* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define STAGE1_SIZE 147253

#define COUNTING_SORT_BITS 11
#define COUNTING_SORT_SIZE (1 << COUNTING_SORT_BITS)
#define FINAL_SORT_BATCH_SIZE COUNTING_SORT_SIZE
#define FINAL_SORT_OVERLAP_SIZE 32

// One work-group per input slot (selected by gid): positions are sorted by a 64-bit key built
// from the bytes at each position, and the transformed output is written at the end of the kernel.
__attribute__((reqd_work_group_size(BWT_GROUP_SIZE, 1, 1)))
__kernel void BWT(__global uint8_t* datas, __global uint32_t* data_sizes, uint32_t data_stride, __global uint64_t* indices, __global uint64_t* tmp_indices)
{
    const uint32_t tid = get_local_id(0);
    const uint32_t gid = get_group_id(0);

    // Histograms (and later prefix sums) for the two 11-bit counting-sort passes.
    __local int counters[COUNTING_SORT_SIZE][2];

    for (uint32_t i = tid; i < COUNTING_SORT_SIZE * 2; i += BWT_GROUP_SIZE)
        ((__local int*)counters)[i] = 0;

    const uint64_t data_offset = (uint64_t)(gid) * data_stride;

    __global uint8_t* input = datas + data_offset + 128;
    const uint32_t N = data_sizes[gid] + 1;

    __global uint64_t* p = (__global uint64_t*)(input);
    volatile __local uint8_t* counters_atomic = (volatile __local uint8_t*)(counters);

    indices += data_offset;
    tmp_indices += data_offset;

    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint32_t index = i >> 3;
        const uint32_t bit_offset = (i & 7) << 3;

        const uint64_t a = p[index];
        uint64_t b = p[index + 1];
        if (bit_offset == 0)
            b = 0;

        uint64_t value = (a >> bit_offset) | (b << (64 - bit_offset));

        // Byte-swap so that comparing the 64-bit key numerically matches comparing the
        // underlying bytes lexicographically.
        uint2 tmp;
        const uchar4 mask = (uchar4)(3, 2, 1, 0);

        tmp.x = as_uint(shuffle(as_uchar4(as_uint2(value).y), mask));
        tmp.y = as_uint(shuffle(as_uchar4(as_uint2(value).x), mask));

        value = as_ulong(tmp);

        // Keep the top 43 key bits and pack the position into the low 21 bits,
        // then histogram the two 11-bit digits used by the counting-sort passes below.
        indices[i] = (value & ((uint64_t)(-1) << 21)) | i;
        atomic_add((volatile __local int*)(counters_atomic + (((value >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        atomic_add((volatile __local int*)(counters_atomic + ((value >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
    }

    // Thread 0 turns both histograms into (inclusive prefix sum - 1), i.e. the last
    // output slot of each bucket for the scatter passes below.
    if (tid == 0)
    {
        int t0 = counters[0][0];
        int t1 = counters[0][1];
        counters[0][0] = t0 - 1;
        counters[0][1] = t1 - 1;
        for (uint32_t i = 1; i < COUNTING_SORT_SIZE; ++i)
        {
            t0 += counters[i][0];
            t1 += counters[i][1];
            counters[i][0] = t0 - 1;
            counters[i][1] = t1 - 1;
        }
    }

    // Counting-sort pass 1: scatter by the lower 11 bits of the 22-bit key prefix into tmp_indices.
    for (int i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint64_t data = indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + (((data >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        tmp_indices[k] = data;
    }

    // Counting-sort pass 2: scatter by the top 11 key bits back into indices, iterating in reverse.
    for (int i = N - 1 - tid; i >= 0; i -= BWT_GROUP_SIZE)
    {
        const uint64_t data = tmp_indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + ((data >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
        indices[k] = data;
    }

    // Reuse the local counters as scratch and finish with an insertion sort over overlapping
    // batches (the overlap lets elements near a batch boundary move into the next batch).
    __local uint64_t* buf = (__local uint64_t*)(counters);
    for (uint32_t i = 0; i < N; i += FINAL_SORT_BATCH_SIZE - FINAL_SORT_OVERLAP_SIZE)
    {
        const uint32_t len = (N - i < FINAL_SORT_BATCH_SIZE) ? (N - i) : FINAL_SORT_BATCH_SIZE;

        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            buf[j] = indices[i + j];

        // Thread 0 insertion-sorts the current batch in local memory.
        if (tid == 0)
        {
            uint64_t prev_t = buf[0];
            for (int i = 1; i < len; ++i)
            {
                uint64_t t = buf[i];
                if (t < prev_t)
                {
                    const uint64_t t2 = prev_t;
                    int j = i - 1;
                    do
                    {
                        buf[j + 1] = prev_t;
                        --j;
                        if (j < 0)
                            break;
                        prev_t = buf[j];
                    } while (t < prev_t);
                    buf[j + 1] = t;
                    t = t2;
                }
                prev_t = t;
            }
        }

        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            indices[i + j] = buf[j];
    }

    // Emit the output: for each position in sorted order, write the byte that precedes it in the input.
    --input;
    __global uint8_t* output = (__global uint8_t*)(tmp_indices);
    for (int i = tid; i <= N; i += BWT_GROUP_SIZE)
        output[i] = input[indices[i] & ((1 << 21) - 1)];
}

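For reference, the sort key packed into indices[] by the loop above can be written as plain host code. The helper below is illustrative only and is not part of this commit; bwt_key and its padding assumption mirror the kernel's reads of p[index + 1], i.e. the input buffer is assumed to have at least eight readable bytes past any position.

#include <stdint.h>

/* Illustrative host-side equivalent of the per-position key in BWT(): the eight bytes
 * starting at position i, read big-endian so that numeric order equals byte order,
 * with the top 43 bits kept and the position packed into the low 21 bits. */
static uint64_t bwt_key(const uint8_t* input, uint32_t i)
{
    uint64_t value = 0;
    for (int k = 0; k < 8; ++k)
        value = (value << 8) | input[i + k]; /* assumes padded input, like the kernel */
    return (value & (~UINT64_C(0) << 21)) | (uint64_t)i;
}
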
// Drops hashes whose stage-2 size would not fit the preallocated buffers; survivors are appended
// to filtered_hashes as 9-word records (nonce, then 8 hash words) behind the counter in element 0.
__kernel void filter(uint32_t nonce, uint32_t bwt_max_size, __global const uint32_t* hashes, __global uint32_t* filtered_hashes)
{
    const uint32_t global_id = get_global_id(0);

    __global const uint32_t* hash = hashes + global_id * (32 / sizeof(uint32_t));
    const uint32_t stage2_size = STAGE1_SIZE + (*hash & 0xfffff);

    if (stage2_size < bwt_max_size)
    {
        const int index = atomic_add((volatile __global int*)(filtered_hashes), 1) * (36 / sizeof(uint32_t)) + 1;

        filtered_hashes[index] = nonce + global_id;

        #pragma unroll(8)
        for (uint32_t i = 0; i < 8; ++i)
            filtered_hashes[index + i + 1] = hash[i];
    }
}

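The indexing used by filter (and reused by prepare_batch2 and find_shares) implies a simple record layout for filtered_hashes: element 0 of the buffer is an atomic record count, followed by packed 36-byte records. The struct below is only an illustrative host-side view under that reading, not something defined in this commit.

#include <stdint.h>

/* Illustrative host-side view of one filtered_hashes record (9 x uint32_t = 36 bytes,
 * matching the "* (36 / sizeof(uint32_t)) + 1" indexing above). */
typedef struct {
    uint32_t nonce;   /* nonce + global_id written by filter() */
    uint32_t hash[8]; /* the 32-byte hash that passed the size check */
} filtered_hash_record;
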
// Pops the last get_global_size(0) filtered hashes off the list to form the next batch:
// the hash is copied out and its stage-2 data size is recorded for the BWT buffers.
__kernel void prepare_batch2(__global uint32_t* hashes, __global uint32_t* filtered_hashes, __global uint32_t* data_sizes)
{
    const uint32_t global_id = get_global_id(0);
    const uint32_t N = filtered_hashes[0] - get_global_size(0);

    if (global_id == 0)
        filtered_hashes[0] = N;

    __global uint32_t* hash = hashes + global_id * 8;
    __global uint32_t* filtered_hash = filtered_hashes + (global_id + N) * 9 + 1;

    const uint32_t stage2_size = STAGE1_SIZE + (filtered_hash[1] & 0xfffff);
    data_sizes[global_id] = stage2_size;

    #pragma unroll(8)
    for (uint32_t i = 0; i < 8; ++i)
        hash[i] = filtered_hash[i + 1];
}

// Compares the high 64 bits of each final hash against the target; shares[0xFF] counts hits
// and shares[0..0xFE] receive the nonces of the matching hashes.
__kernel void find_shares(__global const uint64_t* hashes, __global const uint32_t* filtered_hashes, uint64_t target, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target)
    {
        const uint32_t idx = atomic_inc(shares + 0xFF);
        if (idx < 0xFF)
            shares[idx] = filtered_hashes[(filtered_hashes[0] + global_index) * 9 + 1];
    }
}
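The kernels take their work-group size from a BWT_GROUP_SIZE macro supplied when the program is built (for example via a -DBWT_GROUP_SIZE=... option to clBuildProgram), and BWT appears to expect one work-group per input slot, since get_group_id(0) selects the slot. The sketch below shows how a host might enqueue it under those assumptions; enqueue_bwt, the group size of 32 and the omission of error handling are illustrative choices, not taken from this commit.

#include <CL/cl.h>
#include <stddef.h>

/* Minimal sketch: enqueue BWT for batch_size inputs laid out data_stride bytes apart.
 * All cl_* objects are assumed to be created (and the program built with
 * -DBWT_GROUP_SIZE=32) elsewhere; clSetKernelArg return codes are ignored for brevity. */
static cl_int enqueue_bwt(cl_command_queue queue, cl_kernel bwt_kernel,
                          cl_mem datas, cl_mem data_sizes, cl_uint data_stride,
                          cl_mem indices, cl_mem tmp_indices, size_t batch_size)
{
    const size_t group_size = 32;                        /* must match BWT_GROUP_SIZE */
    const size_t global_size = batch_size * group_size;  /* one work-group per input */

    clSetKernelArg(bwt_kernel, 0, sizeof(cl_mem), &datas);
    clSetKernelArg(bwt_kernel, 1, sizeof(cl_mem), &data_sizes);
    clSetKernelArg(bwt_kernel, 2, sizeof(cl_uint), &data_stride);
    clSetKernelArg(bwt_kernel, 3, sizeof(cl_mem), &indices);
    clSetKernelArg(bwt_kernel, 4, sizeof(cl_mem), &tmp_indices);

    return clEnqueueNDRangeKernel(queue, bwt_kernel, 1, NULL,
                                  &global_size, &group_size, 0, NULL, NULL);
}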