AstroBWT OpenCL code
This commit is contained in:
parent 50eb7ba2fd
commit fbedf197ab
34 changed files with 2106 additions and 6 deletions
src/backend/opencl/cl/astrobwt/BWT.cl (new file, +199)
@@ -0,0 +1,199 @@
/* XMRig
 * Copyright 2010      Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler      <pooler@litecoinpool.org>
 * Copyright 2014      Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466    <https://github.com/OhGodAPet>
 * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018-2020 SChernykh   <https://github.com/SChernykh>
 * Copyright 2016-2020 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define STAGE1_SIZE 147253

#define COUNTING_SORT_BITS 11
#define COUNTING_SORT_SIZE (1 << COUNTING_SORT_BITS)
#define FINAL_SORT_BATCH_SIZE COUNTING_SORT_SIZE
#define FINAL_SORT_OVERLAP_SIZE 32

// One work-group per input slot (selected by gid): positions are sorted by a 64-bit key built
// from the bytes at each position, and the transformed output is written at the end of the kernel.
__attribute__((reqd_work_group_size(BWT_GROUP_SIZE, 1, 1)))
__kernel void BWT(__global uint8_t* datas, __global uint32_t* data_sizes, uint32_t data_stride, __global uint64_t* indices, __global uint64_t* tmp_indices)
{
    const uint32_t tid = get_local_id(0);
    const uint32_t gid = get_group_id(0);

    // Histograms (and later prefix sums) for the two 11-bit counting-sort passes.
    __local int counters[COUNTING_SORT_SIZE][2];

    for (uint32_t i = tid; i < COUNTING_SORT_SIZE * 2; i += BWT_GROUP_SIZE)
        ((__local int*)counters)[i] = 0;

    const uint64_t data_offset = (uint64_t)(gid) * data_stride;

    __global uint8_t* input = datas + data_offset + 128;
    const uint32_t N = data_sizes[gid] + 1;

    __global uint64_t* p = (__global uint64_t*)(input);
    volatile __local uint8_t* counters_atomic = (volatile __local uint8_t*)(counters);

    indices += data_offset;
    tmp_indices += data_offset;

    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint32_t index = i >> 3;
        const uint32_t bit_offset = (i & 7) << 3;

        const uint64_t a = p[index];
        uint64_t b = p[index + 1];
        if (bit_offset == 0)
            b = 0;

        uint64_t value = (a >> bit_offset) | (b << (64 - bit_offset));

        // Byte-swap so that comparing the 64-bit key numerically matches comparing the
        // underlying bytes lexicographically.
        uint2 tmp;
        const uchar4 mask = (uchar4)(3, 2, 1, 0);

        tmp.x = as_uint(shuffle(as_uchar4(as_uint2(value).y), mask));
        tmp.y = as_uint(shuffle(as_uchar4(as_uint2(value).x), mask));

        value = as_ulong(tmp);

        // Keep the top 43 key bits and pack the position into the low 21 bits,
        // then histogram the two 11-bit digits used by the counting-sort passes below.
        indices[i] = (value & ((uint64_t)(-1) << 21)) | i;
        atomic_add((volatile __local int*)(counters_atomic + (((value >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        atomic_add((volatile __local int*)(counters_atomic + ((value >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
    }

    // Thread 0 turns both histograms into (inclusive prefix sum - 1), i.e. the last
    // output slot of each bucket for the scatter passes below.
    if (tid == 0)
    {
        int t0 = counters[0][0];
        int t1 = counters[0][1];
        counters[0][0] = t0 - 1;
        counters[0][1] = t1 - 1;
        for (uint32_t i = 1; i < COUNTING_SORT_SIZE; ++i)
        {
            t0 += counters[i][0];
            t1 += counters[i][1];
            counters[i][0] = t0 - 1;
            counters[i][1] = t1 - 1;
        }
    }

    // Counting-sort pass 1: scatter by the lower 11 bits of the 22-bit key prefix into tmp_indices.
    for (int i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint64_t data = indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + (((data >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        tmp_indices[k] = data;
    }

    // Counting-sort pass 2: scatter by the top 11 key bits back into indices, iterating in reverse.
    for (int i = N - 1 - tid; i >= 0; i -= BWT_GROUP_SIZE)
    {
        const uint64_t data = tmp_indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + ((data >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
        indices[k] = data;
    }

    // Reuse the local counters as scratch and finish with an insertion sort over overlapping
    // batches (the overlap lets elements near a batch boundary move into the next batch).
    __local uint64_t* buf = (__local uint64_t*)(counters);
    for (uint32_t i = 0; i < N; i += FINAL_SORT_BATCH_SIZE - FINAL_SORT_OVERLAP_SIZE)
    {
        const uint32_t len = (N - i < FINAL_SORT_BATCH_SIZE) ? (N - i) : FINAL_SORT_BATCH_SIZE;

        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            buf[j] = indices[i + j];

        // Thread 0 insertion-sorts the current batch in local memory.
        if (tid == 0)
        {
            uint64_t prev_t = buf[0];
            for (int i = 1; i < len; ++i)
            {
                uint64_t t = buf[i];
                if (t < prev_t)
                {
                    const uint64_t t2 = prev_t;
                    int j = i - 1;
                    do
                    {
                        buf[j + 1] = prev_t;
                        --j;
                        if (j < 0)
                            break;
                        prev_t = buf[j];
                    } while (t < prev_t);
                    buf[j + 1] = t;
                    t = t2;
                }
                prev_t = t;
            }
        }

        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            indices[i + j] = buf[j];
    }

    // Emit the output: for each position in sorted order, write the byte that precedes it in the input.
    --input;
    __global uint8_t* output = (__global uint8_t*)(tmp_indices);
    for (int i = tid; i <= N; i += BWT_GROUP_SIZE)
        output[i] = input[indices[i] & ((1 << 21) - 1)];
}

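For reference, the sort key packed into indices[] by the loop above can be written as plain host code. The helper below is illustrative only and is not part of this commit; bwt_key and its padding assumption mirror the kernel's reads of p[index + 1], i.e. the input buffer is assumed to have at least eight readable bytes past any position.

#include <stdint.h>

/* Illustrative host-side equivalent of the per-position key in BWT(): the eight bytes
 * starting at position i, read big-endian so that numeric order equals byte order,
 * with the top 43 bits kept and the position packed into the low 21 bits. */
static uint64_t bwt_key(const uint8_t* input, uint32_t i)
{
    uint64_t value = 0;
    for (int k = 0; k < 8; ++k)
        value = (value << 8) | input[i + k]; /* assumes padded input, like the kernel */
    return (value & (~UINT64_C(0) << 21)) | (uint64_t)i;
}
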
// Drops hashes whose stage-2 size would not fit the preallocated buffers; survivors are appended
// to filtered_hashes as 9-word records (nonce, then 8 hash words) behind the counter in element 0.
__kernel void filter(uint32_t nonce, uint32_t bwt_max_size, __global const uint32_t* hashes, __global uint32_t* filtered_hashes)
{
    const uint32_t global_id = get_global_id(0);

    __global const uint32_t* hash = hashes + global_id * (32 / sizeof(uint32_t));
    const uint32_t stage2_size = STAGE1_SIZE + (*hash & 0xfffff);

    if (stage2_size < bwt_max_size)
    {
        const int index = atomic_add((volatile __global int*)(filtered_hashes), 1) * (36 / sizeof(uint32_t)) + 1;

        filtered_hashes[index] = nonce + global_id;

        #pragma unroll(8)
        for (uint32_t i = 0; i < 8; ++i)
            filtered_hashes[index + i + 1] = hash[i];
    }
}

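The indexing used by filter (and reused by prepare_batch2 and find_shares) implies a simple record layout for filtered_hashes: element 0 of the buffer is an atomic record count, followed by packed 36-byte records. The struct below is only an illustrative host-side view under that reading, not something defined in this commit.

#include <stdint.h>

/* Illustrative host-side view of one filtered_hashes record (9 x uint32_t = 36 bytes,
 * matching the "* (36 / sizeof(uint32_t)) + 1" indexing above). */
typedef struct {
    uint32_t nonce;   /* nonce + global_id written by filter() */
    uint32_t hash[8]; /* the 32-byte hash that passed the size check */
} filtered_hash_record;
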
// Pops the last get_global_size(0) filtered hashes off the list to form the next batch:
// the hash is copied out and its stage-2 data size is recorded for the BWT buffers.
__kernel void prepare_batch2(__global uint32_t* hashes, __global uint32_t* filtered_hashes, __global uint32_t* data_sizes)
{
    const uint32_t global_id = get_global_id(0);
    const uint32_t N = filtered_hashes[0] - get_global_size(0);

    if (global_id == 0)
        filtered_hashes[0] = N;

    __global uint32_t* hash = hashes + global_id * 8;
    __global uint32_t* filtered_hash = filtered_hashes + (global_id + N) * 9 + 1;

    const uint32_t stage2_size = STAGE1_SIZE + (filtered_hash[1] & 0xfffff);
    data_sizes[global_id] = stage2_size;

    #pragma unroll(8)
    for (uint32_t i = 0; i < 8; ++i)
        hash[i] = filtered_hash[i + 1];
}

// Compares the high 64 bits of each final hash against the target; shares[0xFF] counts hits
// and shares[0..0xFE] receive the nonces of the matching hashes.
__kernel void find_shares(__global const uint64_t* hashes, __global const uint32_t* filtered_hashes, uint64_t target, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target)
    {
        const uint32_t idx = atomic_inc(shares + 0xFF);
        if (idx < 0xFF)
            shares[idx] = filtered_hashes[(filtered_hashes[0] + global_index) * 9 + 1];
    }
}
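The kernels take their work-group size from a BWT_GROUP_SIZE macro supplied when the program is built (for example via a -DBWT_GROUP_SIZE=... option to clBuildProgram), and BWT appears to expect one work-group per input slot, since get_group_id(0) selects the slot. The sketch below shows how a host might enqueue it under those assumptions; enqueue_bwt, the group size of 32 and the omission of error handling are illustrative choices, not taken from this commit.

#include <CL/cl.h>
#include <stddef.h>

/* Minimal sketch: enqueue BWT for batch_size inputs laid out data_stride bytes apart.
 * All cl_* objects are assumed to be created (and the program built with
 * -DBWT_GROUP_SIZE=32) elsewhere; clSetKernelArg return codes are ignored for brevity. */
static cl_int enqueue_bwt(cl_command_queue queue, cl_kernel bwt_kernel,
                          cl_mem datas, cl_mem data_sizes, cl_uint data_stride,
                          cl_mem indices, cl_mem tmp_indices, size_t batch_size)
{
    const size_t group_size = 32;                        /* must match BWT_GROUP_SIZE */
    const size_t global_size = batch_size * group_size;  /* one work-group per input */

    clSetKernelArg(bwt_kernel, 0, sizeof(cl_mem), &datas);
    clSetKernelArg(bwt_kernel, 1, sizeof(cl_mem), &data_sizes);
    clSetKernelArg(bwt_kernel, 2, sizeof(cl_uint), &data_stride);
    clSetKernelArg(bwt_kernel, 3, sizeof(cl_mem), &indices);
    clSetKernelArg(bwt_kernel, 4, sizeof(cl_mem), &tmp_indices);

    return clEnqueueNDRangeKernel(queue, bwt_kernel, 1, NULL,
                                  &global_size, &group_size, 0, NULL, NULL);
}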