// REDACTED-rig/src/backend/opencl/cl/astrobwt/BWT.cl

/* XMRig
* Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
* Copyright 2012-2014 pooler <pooler@litecoinpool.org>
* Copyright 2014 Lucas Jones <https://github.com/lucasjones>
* Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
* Copyright 2016 Jay D Dee <jayddee246@gmail.com>
* Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
* Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
* Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
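
// OpenCL kernels for the AstroBWT algorithm:
//   BWT            - Burrows-Wheeler transform, one record per work group,
//                    implemented as a two-pass counting sort on the top 22 bits
//                    of each suffix key followed by a batched insertion sort.
//   filter         - drops hashes whose stage 2 size exceeds the BWT buffers.
//   prepare_batch2 - unpacks a batch of filtered hashes for stage 2.
//   find_shares    - collects nonces of hashes below the target.
// BWT_GROUP_SIZE is supplied by the host at program build time.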
// Size in bytes of the stage 1 output
#define STAGE1_SIZE 147253

// Each counting-sort pass buckets suffixes by 11 key bits (2048 buckets)
#define COUNTING_SORT_BITS 11
#define COUNTING_SORT_SIZE (1 << COUNTING_SORT_BITS)

// The final insertion sort runs over overlapping batches staged in local memory
#define FINAL_SORT_BATCH_SIZE COUNTING_SORT_SIZE
#define FINAL_SORT_OVERLAP_SIZE 32
__attribute__((reqd_work_group_size(BWT_GROUP_SIZE, 1, 1)))
__kernel void BWT(__global uint8_t* datas, __global uint32_t* data_sizes, uint32_t data_stride, __global uint64_t* indices, __global uint64_t* tmp_indices)
{
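    // One work group of BWT_GROUP_SIZE threads transforms one record. Records in
    // "datas" are data_stride bytes apart: a 128-byte prefix (apparently reserved
    // by the host), then the stage 2 data itself. "indices" and "tmp_indices" are
    // per-record scratch buffers, advanced below by the same per-record offset.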
    const uint32_t tid = get_local_id(0);
    const uint32_t gid = get_group_id(0);

    __local int counters[COUNTING_SORT_SIZE][2];

    for (uint32_t i = tid; i < COUNTING_SORT_SIZE * 2; i += BWT_GROUP_SIZE)
        ((__local int*)counters)[i] = 0;

    const uint64_t data_offset = (uint64_t)(gid) * data_stride;

    __global uint8_t* input = datas + data_offset + 128;

    // Number of suffixes to sort (data size + 1)
    const uint32_t N = data_sizes[gid] + 1;
    __global uint64_t* p = (__global uint64_t*)(input);

    volatile __local uint8_t* counters_atomic = (volatile __local uint8_t*)(counters);

    indices += data_offset;
    tmp_indices += data_offset;

    // Make the zero-initialized counters visible before the atomic histogram below
    barrier(CLK_LOCAL_MEM_FENCE);
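
    // Phase 1: build a 64-bit sort key for each of the N suffixes and histogram
    // the top key bits for the two counting-sort passes.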
    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint32_t index = i >> 3;
        const uint32_t bit_offset = (i & 7) << 3;
        const uint64_t a = p[index];
        uint64_t b = p[index + 1];

        // A 64-bit shift by 64 is undefined, so zero "b" when no bits are needed from it
        if (bit_offset == 0)
            b = 0;

        // 8 bytes of the input, starting at byte offset i
        uint64_t value = (a >> bit_offset) | (b << (64 - bit_offset));

        // Byte-swap to big-endian so an integer compare orders suffixes
        // lexicographically, e.g. 0x0807060504030201 -> 0x0102030405060708
        uint2 tmp;
        const uchar4 mask = (uchar4)(3, 2, 1, 0);
        tmp.x = as_uint(shuffle(as_uchar4(as_uint2(value).y), mask));
        tmp.y = as_uint(shuffle(as_uchar4(as_uint2(value).x), mask));
        value = as_ulong(tmp);

        // Top 43 bits hold the key prefix, low 21 bits the suffix index (N fits in 21 bits)
        indices[i] = (value & ((uint64_t)(-1) << 21)) | i;

        // Histogram bits 42-52 (pass 1) and bits 53-63 (pass 2) of the key
        atomic_add((volatile __local int*)(counters_atomic + (((value >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        atomic_add((volatile __local int*)(counters_atomic + ((value >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
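
    // A single thread turns both histograms into inclusive prefix sums minus one:
    // each counter now points at the last output slot of its bucket, to be
    // consumed back-to-front by atomic_sub in the scatter passes.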
    if (tid == 0)
    {
        int t0 = counters[0][0];
        int t1 = counters[0][1];
        counters[0][0] = t0 - 1;
        counters[0][1] = t1 - 1;
        for (uint32_t i = 1; i < COUNTING_SORT_SIZE; ++i)
        {
            t0 += counters[i][0];
            t1 += counters[i][1];
            counters[i][0] = t0 - 1;
            counters[i][1] = t1 - 1;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
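
    // Pass 1: scatter suffixes into tmp_indices, bucketed by bits 42-52 of the key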
    for (int i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint64_t data = indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + (((data >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        tmp_indices[k] = data;
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
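
    // Pass 2: scatter back into indices by bits 53-63. Iterating backwards keeps
    // elements roughly in pass 1 order; residual disorder is repaired by the
    // final insertion sort below.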
    for (int i = N - 1 - tid; i >= 0; i -= BWT_GROUP_SIZE)
    {
        const uint64_t data = tmp_indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + ((data >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
        indices[k] = data;
    }

    barrier(CLK_GLOBAL_MEM_FENCE);
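
    // The counting sorts ordered suffixes by their top 22 key bits only; finish
    // with an insertion sort over overlapping batches. The 32-element overlap
    // re-sorts each batch's tail together with the next batch, so elements near
    // a batch boundary can still be placed correctly.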
    // Reuse the counters' local memory as the batch buffer:
    // COUNTING_SORT_SIZE * 2 ints = FINAL_SORT_BATCH_SIZE uint64 values
    __local uint64_t* buf = (__local uint64_t*)(counters);

    for (uint32_t i = 0; i < N; i += FINAL_SORT_BATCH_SIZE - FINAL_SORT_OVERLAP_SIZE)
    {
        const uint32_t len = (N - i < FINAL_SORT_BATCH_SIZE) ? (N - i) : FINAL_SORT_BATCH_SIZE;

        // Stage the batch in local memory
        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            buf[j] = indices[i + j];

        barrier(CLK_LOCAL_MEM_FENCE);

        // Single-threaded insertion sort; the batch is already nearly sorted,
        // so this is close to linear time
        if (tid == 0)
        {
            uint64_t prev_t = buf[0];
            for (uint32_t i1 = 1; i1 < len; ++i1)
            {
                uint64_t t = buf[i1];
                if (t < prev_t)
                {
                    const uint64_t t2 = prev_t;
                    int j = i1 - 1;
                    do
                    {
                        buf[j + 1] = prev_t;
                        --j;
                        if (j < 0)
                            break;
                        prev_t = buf[j];
                    } while (t < prev_t);
                    buf[j + 1] = t;
                    t = t2;
                }
                prev_t = t;
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        // Write the sorted batch back before the next (overlapping) batch is loaded
        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            indices[i + j] = buf[j];

        barrier(CLK_GLOBAL_MEM_FENCE | CLK_LOCAL_MEM_FENCE);
    }
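
    // Emit the BWT: for each suffix in sorted order, output the byte that
    // precedes it in the input ("--input" shifts the base pointer by one).
    // For index 0 this reads the byte just before the data, inside the
    // 128-byte prefix.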
    --input;

    __global uint8_t* output = (__global uint8_t*)(tmp_indices);

    // Only N suffix indices were sorted, so emit exactly N bytes
    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
        output[i] = input[indices[i] & ((1 << 21) - 1)];
}
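
// Scans a batch of stage 1 hashes and keeps those whose stage 2 size fits the
// preallocated BWT buffers. filtered_hashes[0] is an atomic record counter; each
// record is 9 uints (36 bytes): the nonce followed by the 8-word (32-byte) hash.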
__kernel void filter(uint32_t nonce, uint32_t bwt_max_size, __global const uint32_t* hashes, __global uint32_t* filtered_hashes)
{
    const uint32_t global_id = get_global_id(0);

    __global const uint32_t* hash = hashes + global_id * (32 / sizeof(uint32_t));
    const uint32_t stage2_size = STAGE1_SIZE + (*hash & 0xfffff);

    if (stage2_size < bwt_max_size)
    {
        const int index = atomic_add((volatile __global int*)(filtered_hashes), 1) * (36 / sizeof(uint32_t)) + 1;

        filtered_hashes[index] = nonce + global_id;

        #pragma unroll(8)
        for (uint32_t i = 0; i < 8; ++i)
            filtered_hashes[index + i + 1] = hash[i];
    }
}
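
// Pops the last get_global_size(0) records off the filtered list (updating the
// counter in filtered_hashes[0]) and unpacks each into the hashes buffer plus
// the per-record stage 2 size consumed by the BWT kernel.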
__kernel void prepare_batch2(__global uint32_t* hashes, __global uint32_t* filtered_hashes, __global uint32_t* data_sizes)
{
    const uint32_t global_id = get_global_id(0);
    const uint32_t N = filtered_hashes[0] - get_global_size(0);

    if (global_id == 0)
        filtered_hashes[0] = N;

    __global uint32_t* hash = hashes + global_id * 8;
    __global uint32_t* filtered_hash = filtered_hashes + (global_id + N) * 9 + 1;

    const uint32_t stage2_size = STAGE1_SIZE + (filtered_hash[1] & 0xfffff);
    data_sizes[global_id] = stage2_size;

    #pragma unroll(8)
    for (uint32_t i = 0; i < 8; ++i)
        hash[i] = filtered_hash[i + 1];
}
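
// Compares the last 64-bit word of each 32-byte final hash against the job
// target. shares[0xFF] is an atomic result counter; the nonces of up to 255
// matching hashes are stored in shares[0..0xFE].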
__kernel void find_shares(__global const uint64_t* hashes, __global const uint32_t* filtered_hashes, uint64_t target, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target)
    {
        const uint32_t idx = atomic_inc(shares + 0xFF);

        if (idx < 0xFF)
            shares[idx] = filtered_hashes[(filtered_hashes[0] + global_index) * 9 + 1];
    }
}