/* XMRig
 * Copyright 2010 Jeff Garzik <jgarzik@pobox.com>
 * Copyright 2012-2014 pooler <pooler@litecoinpool.org>
 * Copyright 2014 Lucas Jones <https://github.com/lucasjones>
 * Copyright 2014-2016 Wolf9466 <https://github.com/OhGodAPet>
 * Copyright 2016 Jay D Dee <jayddee246@gmail.com>
 * Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
 * Copyright 2018-2020 SChernykh <https://github.com/SChernykh>
 * Copyright 2016-2020 XMRig <https://github.com/xmrig>, <support@xmrig.com>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <http://www.gnu.org/licenses/>.
 */

#define STAGE1_SIZE 147253

#define COUNTING_SORT_BITS 11
#define COUNTING_SORT_SIZE (1 << COUNTING_SORT_BITS)
#define FINAL_SORT_BATCH_SIZE COUNTING_SORT_SIZE
#define FINAL_SORT_OVERLAP_SIZE 32
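
// One work-group builds the Burrows-Wheeler transform of one input buffer.
// Each position gets a 64-bit key: the first bytes of its suffix, byte-swapped
// to big-endian, in the top 43 bits, and the position itself in the low 21 bits.
// Keys are ordered with a two-pass counting sort on the top 2 * COUNTING_SORT_BITS
// key bits, then an insertion sort over overlapping batches finishes the ordering.
// BWT_GROUP_SIZE and the uintN_t typedefs are expected to be supplied by the host
// when this source is compiled.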
__attribute__((reqd_work_group_size(BWT_GROUP_SIZE, 1, 1)))
__kernel void BWT(__global uint8_t* datas, __global uint32_t* data_sizes, uint32_t data_stride, __global uint64_t* indices, __global uint64_t* tmp_indices)
{
    const uint32_t tid = get_local_id(0);
    const uint32_t gid = get_group_id(0);

    // Histogram buckets for the two counting-sort passes (one column per pass)
    __local int counters[COUNTING_SORT_SIZE][2];

    for (uint32_t i = tid; i < COUNTING_SORT_SIZE * 2; i += BWT_GROUP_SIZE)
        ((__local int*)counters)[i] = 0;

    const uint64_t data_offset = (uint64_t)(gid) * data_stride;

    // Input bytes for this work-group start 128 bytes into its slot
    __global uint8_t* input = datas + data_offset + 128;
    const uint32_t N = data_sizes[gid] + 1;

    __global uint64_t* p = (__global uint64_t*)(input);
    volatile __local uint8_t* counters_atomic = (volatile __local uint8_t*)(counters);

    indices += data_offset;
    tmp_indices += data_offset;

    // Make the zeroed counters visible to the whole work-group before counting
    barrier(CLK_LOCAL_MEM_FENCE);
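
    // Pass 0: build a 64-bit sort key for every position i. Eight input bytes
    // starting at i are byte-swapped to big-endian so that an integer compare
    // orders suffixes lexicographically; the low 21 bits of the key store i.
    // Histograms for the two counting-sort digits are accumulated at the same time.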
    for (uint32_t i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint32_t index = i >> 3;
        const uint32_t bit_offset = (i & 7) << 3;

        // Assemble the 8 bytes starting at offset i from two aligned 64-bit loads
        const uint64_t a = p[index];
        uint64_t b = p[index + 1];
        if (bit_offset == 0)
            b = 0;

        uint64_t value = (a >> bit_offset) | (b << (64 - bit_offset));

        // Byte-swap to big-endian: reverse the bytes of each 32-bit half and swap the halves
        uint2 tmp;
        const uchar4 mask = (uchar4)(3, 2, 1, 0);

        tmp.x = as_uint(shuffle(as_uchar4(as_uint2(value).y), mask));
        tmp.y = as_uint(shuffle(as_uchar4(as_uint2(value).x), mask));

        value = as_ulong(tmp);

        // Top 43 bits: key prefix, low 21 bits: suffix index
        indices[i] = (value & ((uint64_t)(-1) << 21)) | i;

        // Count the lower (bits 42-52) and upper (bits 53-63) 11-bit digits of the key
        atomic_add((volatile __local int*)(counters_atomic + (((value >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        atomic_add((volatile __local int*)(counters_atomic + ((value >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
    }

    barrier(CLK_LOCAL_MEM_FENCE);
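
    // Turn both histograms into inclusive prefix sums minus one, i.e. the index
    // of the last output slot of every digit bucket (single work-item scan).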
    if (tid == 0)
    {
        int t0 = counters[0][0];
        int t1 = counters[0][1];
        counters[0][0] = t0 - 1;
        counters[0][1] = t1 - 1;
        for (uint32_t i = 1; i < COUNTING_SORT_SIZE; ++i)
        {
            t0 += counters[i][0];
            t1 += counters[i][1];
            counters[i][0] = t0 - 1;
            counters[i][1] = t1 - 1;
        }
    }

    barrier(CLK_LOCAL_MEM_FENCE);
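
    // Counting-sort pass 1: scatter by the lower 11-bit digit into tmp_indices,
    // taking slots from the back of each bucket with atomic decrements.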
    for (int i = tid; i < N; i += BWT_GROUP_SIZE)
    {
        const uint64_t data = indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + (((data >> (64 - COUNTING_SORT_BITS * 2)) & (COUNTING_SORT_SIZE - 1)) << 3)), 1);
        tmp_indices[k] = data;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
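
    // Counting-sort pass 2: scatter by the upper 11-bit digit back into indices,
    // walking tmp_indices in reverse. After this pass the keys are grouped by
    // their top 22 bits; any remaining local disorder is fixed by the final pass below.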
    for (int i = N - 1 - tid; i >= 0; i -= BWT_GROUP_SIZE)
    {
        const uint64_t data = tmp_indices[i];
        const int k = atomic_sub((volatile __local int*)(counters_atomic + ((data >> (64 - COUNTING_SORT_BITS)) << 3) + 4), 1);
        indices[k] = data;
    }
    barrier(CLK_GLOBAL_MEM_FENCE);
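
    // Final pass: insertion-sort overlapping batches of keys staged in local
    // memory (the counters array is reused as the staging buffer). Each batch
    // overlaps the previous one by FINAL_SORT_OVERLAP_SIZE elements so keys can
    // still move across batch boundaries.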
    __local uint64_t* buf = (__local uint64_t*)(counters);
    for (uint32_t i = 0; i < N; i += FINAL_SORT_BATCH_SIZE - FINAL_SORT_OVERLAP_SIZE)
    {
        const uint32_t len = (N - i < FINAL_SORT_BATCH_SIZE) ? (N - i) : FINAL_SORT_BATCH_SIZE;

        // Cooperative load of the current batch into local memory
        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            buf[j] = indices[i + j];

        barrier(CLK_LOCAL_MEM_FENCE);

        // Insertion sort of the batch by a single work-item
        if (tid == 0)
        {
            uint64_t prev_t = buf[0];
            for (int i = 1; i < len; ++i)
            {
                uint64_t t = buf[i];
                if (t < prev_t)
                {
                    const uint64_t t2 = prev_t;
                    int j = i - 1;
                    do
                    {
                        buf[j + 1] = prev_t;
                        --j;
                        if (j < 0)
                            break;
                        prev_t = buf[j];
                    } while (t < prev_t);
                    buf[j + 1] = t;
                    t = t2;
                }
                prev_t = t;
            }
        }

        barrier(CLK_LOCAL_MEM_FENCE);

        // Cooperative store of the sorted batch back to global memory
        for (uint32_t j = tid; j < len; j += BWT_GROUP_SIZE)
            indices[i + j] = buf[j];

        barrier(CLK_GLOBAL_MEM_FENCE);
    }
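
    // Emit the transform: the low 21 bits of each sorted key are the suffix
    // position, and input is rewound by one byte so output[rank] is the byte
    // that precedes that suffix in the original data.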
    --input;
    __global uint8_t* output = (__global uint8_t*)(tmp_indices);
    for (int i = tid; i <= N; i += BWT_GROUP_SIZE)
        output[i] = input[indices[i] & ((1 << 21) - 1)];
}
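
// Keep only the hashes whose derived stage-2 size fits within bwt_max_size.
// filtered_hashes layout: word 0 is an atomic counter of stored records,
// followed by 9-word records (nonce, then 8 hash words; 36 bytes each).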
__kernel void filter(uint32_t nonce, uint32_t bwt_max_size, __global const uint32_t* hashes, __global uint32_t* filtered_hashes)
{
    const uint32_t global_id = get_global_id(0);

    __global const uint32_t* hash = hashes + global_id * (32 / sizeof(uint32_t));
    const uint32_t stage2_size = STAGE1_SIZE + (*hash & 0xfffff);

    if (stage2_size < bwt_max_size)
    {
        const int index = atomic_add((volatile __global int*)(filtered_hashes), 1) * (36 / sizeof(uint32_t)) + 1;

        filtered_hashes[index] = nonce + global_id;

        #pragma unroll(8)
        for (uint32_t i = 0; i < 8; ++i)
            filtered_hashes[index + i + 1] = hash[i];
    }
}
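
// Pop the last get_global_size(0) filtered records off the list: update the
// record counter, copy each record's 8 hash words into the hashes buffer for
// this batch, and record its stage-2 data size for the BWT kernel.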
__kernel void prepare_batch2(__global uint32_t* hashes, __global uint32_t* filtered_hashes, __global uint32_t* data_sizes)
{
    const uint32_t global_id = get_global_id(0);
    const uint32_t N = filtered_hashes[0] - get_global_size(0);

    if (global_id == 0)
        filtered_hashes[0] = N;

    __global uint32_t* hash = hashes + global_id * 8;
    __global uint32_t* filtered_hash = filtered_hashes + (global_id + N) * 9 + 1;

    const uint32_t stage2_size = STAGE1_SIZE + (filtered_hash[1] & 0xfffff);
    data_sizes[global_id] = stage2_size;

    #pragma unroll(8)
    for (uint32_t i = 0; i < 8; ++i)
        hash[i] = filtered_hash[i + 1];
}
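
// A result is a share when the last 8 bytes of its 32-byte hash, read as a
// 64-bit integer, are below the target. Winning nonces are written to
// shares[0..0xFE]; shares[0xFF] is the atomic result counter.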
__kernel void find_shares(__global const uint64_t* hashes, __global const uint32_t* filtered_hashes, uint64_t target, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target)
    {
        const uint32_t idx = atomic_inc(shares + 0xFF);
        if (idx < 0xFF)
            shares[idx] = filtered_hashes[(filtered_hashes[0] + global_index) * 9 + 1];
    }
}