Conversion to NinjaRig.
This commit is contained in:
parent
84f56f0a4e
commit
2845347881
280 changed files with 18971 additions and 32469 deletions
353
src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu
Normal file
353
src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu
Normal file
|
@ -0,0 +1,353 @@
|
|||
#define BLOCK_BYTES 32
|
||||
#define OUT_BYTES 16
|
||||
#define BLAKE_SHARED_MEM 480
|
||||
#define BLAKE_SHARED_MEM_UINT 120
|
||||
|
||||
#define G(m, r, i, a, b, c, d) \
|
||||
do { \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 0]]; \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = a + b + m[blake2b_sigma[r][2 * i + 1]]; \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define G_S(m, a, b, c, d) \
|
||||
do { \
|
||||
a = a + b + m; \
|
||||
d = rotr64(d ^ a, 32); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 24); \
|
||||
a = a + b + m; \
|
||||
d = rotr64(d ^ a, 16); \
|
||||
c = c + d; \
|
||||
b = rotr64(b ^ c, 63); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define ROUND(m, t, r) \
|
||||
do { \
|
||||
G(m, r, t, v0, v1, v2, v3); \
|
||||
v1 = __shfl_sync(0xFFFFFFFF, v1, t + 1, 4); \
|
||||
v2 = __shfl_sync(0xFFFFFFFF, v2, t + 2, 4); \
|
||||
v3 = __shfl_sync(0xFFFFFFFF, v3, t + 3, 4); \
|
||||
G(m, r, (t + 4), v0, v1, v2, v3); \
|
||||
v1 = __shfl_sync(0xFFFFFFFF, v1, t + 3, 4); \
|
||||
v2 = __shfl_sync(0xFFFFFFFF, v2, t + 2, 4); \
|
||||
v3 = __shfl_sync(0xFFFFFFFF, v3, t + 1, 4); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
#define ROUND_S(m, t) \
|
||||
do { \
|
||||
G_S(m, v0, v1, v2, v3); \
|
||||
v1 = __shfl_sync(0xFFFFFFFF, v1, t + 1, 4); \
|
||||
v2 = __shfl_sync(0xFFFFFFFF, v2, t + 2, 4); \
|
||||
v3 = __shfl_sync(0xFFFFFFFF, v3, t + 3, 4); \
|
||||
G_S(m, v0, v1, v2, v3); \
|
||||
v1 = __shfl_sync(0xFFFFFFFF, v1, t + 3, 4); \
|
||||
v2 = __shfl_sync(0xFFFFFFFF, v2, t + 2, 4); \
|
||||
v3 = __shfl_sync(0xFFFFFFFF, v3, t + 1, 4); \
|
||||
} while ((void)0, 0)
|
||||
|
||||
__constant__ uint64_t blake2b_IV[8] = {
|
||||
0x6A09E667F3BCC908, 0xBB67AE8584CAA73B,
|
||||
0x3C6EF372FE94F82B, 0xA54FF53A5F1D36F1,
|
||||
0x510E527FADE682D1, 0x9B05688C2B3E6C1F,
|
||||
0x1F83D9ABFB41BD6B, 0x5BE0CD19137E2179
|
||||
};
|
||||
|
||||
__constant__ uint32_t blake2b_sigma[12][16] = {
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
{11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4},
|
||||
{7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8},
|
||||
{9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13},
|
||||
{2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9},
|
||||
{12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11},
|
||||
{13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10},
|
||||
{6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5},
|
||||
{10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0},
|
||||
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15},
|
||||
{14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3},
|
||||
};
|
||||
|
||||
__device__ uint64_t rotr64(uint64_t x, uint32_t n)
|
||||
{
|
||||
return (x >> n) | (x << (64 - n));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void blake2b_compress(uint64_t *h, uint64_t *m, uint64_t f0, int thr_id)
|
||||
{
|
||||
uint64_t v0, v1, v2, v3;
|
||||
|
||||
v0 = h[thr_id];
|
||||
v1 = h[thr_id + 4];
|
||||
v2 = blake2b_IV[thr_id];
|
||||
v3 = blake2b_IV[thr_id + 4];
|
||||
|
||||
if(thr_id == 0) v3 ^= h[8];
|
||||
if(thr_id == 1) v3 ^= h[9];
|
||||
if(thr_id == 2) v3 ^= f0;
|
||||
|
||||
ROUND(m, thr_id, 0);
|
||||
ROUND(m, thr_id, 1);
|
||||
ROUND(m, thr_id, 2);
|
||||
ROUND(m, thr_id, 3);
|
||||
ROUND(m, thr_id, 4);
|
||||
ROUND(m, thr_id, 5);
|
||||
ROUND(m, thr_id, 6);
|
||||
ROUND(m, thr_id, 7);
|
||||
ROUND(m, thr_id, 8);
|
||||
ROUND(m, thr_id, 9);
|
||||
ROUND(m, thr_id, 10);
|
||||
ROUND(m, thr_id, 11);
|
||||
|
||||
h[thr_id] ^= v0 ^ v2;
|
||||
h[thr_id + 4] ^= v1 ^ v3;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void blake2b_compress_static(uint64_t *h, uint64_t m, uint64_t f0, int thr_id)
|
||||
{
|
||||
uint64_t v0, v1, v2, v3;
|
||||
|
||||
v0 = h[thr_id];
|
||||
v1 = h[thr_id + 4];
|
||||
v2 = blake2b_IV[thr_id];
|
||||
v3 = blake2b_IV[thr_id + 4];
|
||||
|
||||
if(thr_id == 0) v3 ^= h[8];
|
||||
if(thr_id == 1) v3 ^= h[9];
|
||||
if(thr_id == 2) v3 ^= f0;
|
||||
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
ROUND_S(m, thr_id);
|
||||
|
||||
h[thr_id] ^= v0 ^ v2;
|
||||
h[thr_id + 4] ^= v1 ^ v3;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int blake2b_init(uint64_t *h, int out_len, int thr_id)
|
||||
{
|
||||
h[thr_id * 2] = blake2b_IV[thr_id * 2];
|
||||
h[thr_id * 2 + 1] = blake2b_IV[thr_id * 2 + 1];
|
||||
|
||||
if(thr_id == 0) {
|
||||
h[8] = h[9] = 0;
|
||||
h[0] = 0x6A09E667F3BCC908 ^ ((out_len * 4) | (1 << 16) | (1 << 24));
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void blake2b_incrementCounter(uint64_t *h, int inc)
|
||||
{
|
||||
h[8] += (inc * 4);
|
||||
h[9] += (h[8] < (inc * 4));
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int blake2b_update(uint32_t *in, int in_len, uint64_t *h, uint32_t *buf, int buf_len, int thr_id)
|
||||
{
|
||||
uint32_t *cursor_in = in;
|
||||
uint32_t *cursor_out = buf + buf_len;
|
||||
|
||||
if (buf_len + in_len > BLOCK_BYTES) {
|
||||
int left = BLOCK_BYTES - buf_len;
|
||||
|
||||
for(int i=0; i < (left >> 2); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (left % 4); i++) {
|
||||
cursor_out[i] = cursor_in[i];
|
||||
}
|
||||
blake2b_incrementCounter(h, BLOCK_BYTES);
|
||||
}
|
||||
|
||||
blake2b_compress(h, (uint64_t*)buf, 0, thr_id);
|
||||
|
||||
buf_len = 0;
|
||||
|
||||
in_len -= left;
|
||||
in += left;
|
||||
|
||||
while (in_len > BLOCK_BYTES) {
|
||||
if(thr_id == 0)
|
||||
blake2b_incrementCounter(h, BLOCK_BYTES);
|
||||
|
||||
cursor_in = in;
|
||||
cursor_out = buf;
|
||||
|
||||
for(int i=0; i < (BLOCK_BYTES / 4); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
blake2b_compress(h, (uint64_t *)buf, 0, thr_id);
|
||||
|
||||
in_len -= BLOCK_BYTES;
|
||||
in += BLOCK_BYTES;
|
||||
}
|
||||
}
|
||||
|
||||
cursor_in = in;
|
||||
cursor_out = buf + buf_len;
|
||||
|
||||
for(int i=0; i < (in_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (in_len % 4); i++) {
|
||||
cursor_out[i] = cursor_in[i];
|
||||
}
|
||||
}
|
||||
|
||||
return buf_len + in_len;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ int blake2b_update_static(uint32_t in, int in_len, uint64_t *h, uint32_t *buf, int buf_len, int thr_id)
|
||||
{
|
||||
uint64_t in64 = in;
|
||||
in64 = in64 << 32;
|
||||
in64 = in64 | in;
|
||||
|
||||
uint32_t *cursor_out = buf + buf_len;
|
||||
|
||||
if (buf_len + in_len > BLOCK_BYTES) {
|
||||
int left = BLOCK_BYTES - buf_len;
|
||||
|
||||
for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
|
||||
cursor_out[thr_id] = in;
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (left % 4); i++) {
|
||||
cursor_out[i] = in;
|
||||
}
|
||||
blake2b_incrementCounter(h, BLOCK_BYTES);
|
||||
}
|
||||
|
||||
blake2b_compress(h, (uint64_t*)buf, 0, thr_id);
|
||||
|
||||
buf_len = 0;
|
||||
|
||||
in_len -= left;
|
||||
|
||||
while (in_len > BLOCK_BYTES) {
|
||||
if(thr_id == 0)
|
||||
blake2b_incrementCounter(h, BLOCK_BYTES);
|
||||
|
||||
blake2b_compress_static(h, in64, 0, thr_id);
|
||||
|
||||
in_len -= BLOCK_BYTES;
|
||||
}
|
||||
}
|
||||
|
||||
cursor_out = buf + buf_len;
|
||||
|
||||
for(int i=0; i < (in_len >> 2); i++, cursor_out += 4) {
|
||||
cursor_out[thr_id] = in;
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (in_len % 4); i++) {
|
||||
cursor_out[i] = in;
|
||||
}
|
||||
}
|
||||
|
||||
return buf_len + in_len;
|
||||
}
|
||||
|
||||
__device__ __forceinline__ void blake2b_final(uint32_t *out, int out_len, uint64_t *h, uint32_t *buf, int buf_len, int thr_id)
|
||||
{
|
||||
int left = BLOCK_BYTES - buf_len;
|
||||
uint32_t *cursor_out = buf + buf_len;
|
||||
|
||||
for(int i=0; i < (left >> 2); i++, cursor_out += 4) {
|
||||
cursor_out[thr_id] = 0;
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (left % 4); i++) {
|
||||
cursor_out[i] = 0;
|
||||
}
|
||||
blake2b_incrementCounter(h, buf_len);
|
||||
}
|
||||
|
||||
blake2b_compress(h, (uint64_t*)buf, 0xFFFFFFFFFFFFFFFF, thr_id);
|
||||
|
||||
uint32_t *cursor_in = (uint32_t *)h;
|
||||
cursor_out = out;
|
||||
|
||||
for(int i=0; i < (out_len >> 2); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
if(thr_id == 0) {
|
||||
for (int i = 0; i < (out_len % 4); i++) {
|
||||
cursor_out[i] = cursor_in[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
__device__ void blake2b_digestLong(uint32_t *out, int out_len, uint32_t *in, int in_len, int thr_id, uint32_t *shared)
|
||||
{
|
||||
uint64_t *h = (uint64_t*)shared;
|
||||
uint32_t *buf = (uint32_t*)&h[10];
|
||||
uint32_t *out_buffer = &buf[32];
|
||||
int buf_len;
|
||||
|
||||
if(thr_id == 0) buf[0] = (out_len * 4);
|
||||
buf_len = 1;
|
||||
|
||||
if (out_len <= OUT_BYTES) {
|
||||
blake2b_init(h, out_len, thr_id);
|
||||
buf_len = blake2b_update(in, in_len, h, buf, buf_len, thr_id);
|
||||
blake2b_final(out, out_len, h, buf, buf_len, thr_id);
|
||||
} else {
|
||||
uint32_t *cursor_in = out_buffer;
|
||||
uint32_t *cursor_out = out;
|
||||
|
||||
blake2b_init(h, OUT_BYTES, thr_id);
|
||||
buf_len = blake2b_update(in, in_len, h, buf, buf_len, thr_id);
|
||||
blake2b_final(out_buffer, OUT_BYTES, h, buf, buf_len, thr_id);
|
||||
|
||||
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
out += OUT_BYTES / 2;
|
||||
|
||||
int to_produce = out_len - OUT_BYTES / 2;
|
||||
while (to_produce > OUT_BYTES) {
|
||||
buf_len = blake2b_init(h, OUT_BYTES, thr_id);
|
||||
buf_len = blake2b_update(out_buffer, OUT_BYTES, h, buf, buf_len, thr_id);
|
||||
blake2b_final(out_buffer, OUT_BYTES, h, buf, buf_len, thr_id);
|
||||
|
||||
cursor_out = out;
|
||||
cursor_in = out_buffer;
|
||||
for(int i=0; i < (OUT_BYTES / 8); i++, cursor_in += 4, cursor_out += 4) {
|
||||
cursor_out[thr_id] = cursor_in[thr_id];
|
||||
}
|
||||
|
||||
out += OUT_BYTES / 2;
|
||||
to_produce -= OUT_BYTES / 2;
|
||||
}
|
||||
|
||||
buf_len = blake2b_init(h, to_produce, thr_id);
|
||||
buf_len = blake2b_update(out_buffer, OUT_BYTES, h, buf, buf_len, thr_id);
|
||||
blake2b_final(out, to_produce, h, buf, buf_len, thr_id);
|
||||
}
|
||||
}
|
340
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_hasher.cpp
Normal file
340
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_hasher.cpp
Normal file
|
@ -0,0 +1,340 @@
|
|||
//
|
||||
// Created by Haifa Bogdan Adnan on 03/08/2018.
|
||||
//
|
||||
|
||||
#include <crypto/Argon2_constants.h>
|
||||
|
||||
#include "../../../common/common.h"
|
||||
|
||||
#include "crypto/argon2_hasher/hash/Hasher.h"
|
||||
#include "crypto/argon2_hasher/hash/argon2/Argon2.h"
|
||||
|
||||
#if defined(WITH_CUDA)
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <driver_types.h>
|
||||
|
||||
#include "cuda_hasher.h"
|
||||
#include "../../../common/DLLExport.h"
|
||||
|
||||
cuda_hasher::cuda_hasher() {
|
||||
m_type = "GPU";
|
||||
m_subType = "CUDA";
|
||||
m_shortSubType = "NVD";
|
||||
m_intensity = 0;
|
||||
m_description = "";
|
||||
m_computingThreads = 0;
|
||||
}
|
||||
|
||||
|
||||
cuda_hasher::~cuda_hasher() {
|
||||
this->cleanup();
|
||||
}
|
||||
|
||||
bool cuda_hasher::initialize(xmrig::Algo algorithm, xmrig::Variant variant) {
|
||||
cudaError_t error = cudaSuccess;
|
||||
string error_message;
|
||||
|
||||
m_profile = getArgon2Profile(algorithm, variant);
|
||||
|
||||
__devices = __query_cuda_devices(error, error_message);
|
||||
|
||||
if(error != cudaSuccess) {
|
||||
m_description = "No compatible GPU detected: " + error_message;
|
||||
return false;
|
||||
}
|
||||
|
||||
if (__devices.empty()) {
|
||||
m_description = "No compatible GPU detected.";
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
vector<cuda_device_info *> cuda_hasher::__query_cuda_devices(cudaError_t &error, string &error_message) {
|
||||
vector<cuda_device_info *> devices;
|
||||
int devCount = 0;
|
||||
error = cudaGetDeviceCount(&devCount);
|
||||
|
||||
if(error != cudaSuccess) {
|
||||
error_message = "Error querying CUDA device count.";
|
||||
return devices;
|
||||
}
|
||||
|
||||
if(devCount == 0)
|
||||
return devices;
|
||||
|
||||
for (int i = 0; i < devCount; ++i)
|
||||
{
|
||||
cuda_device_info *dev = __get_device_info(i);
|
||||
if(dev == NULL)
|
||||
continue;
|
||||
if(dev->error != cudaSuccess) {
|
||||
error = dev->error;
|
||||
error_message = dev->error_message;
|
||||
continue;
|
||||
}
|
||||
devices.push_back(dev);
|
||||
}
|
||||
return devices;
|
||||
}
|
||||
|
||||
cuda_device_info *cuda_hasher::__get_device_info(int device_index) {
|
||||
cuda_device_info *device_info = new cuda_device_info();
|
||||
device_info->error = cudaSuccess;
|
||||
device_info->cuda_index = device_index;
|
||||
|
||||
device_info->error = cudaSetDevice(device_index);
|
||||
if(device_info->error != cudaSuccess) {
|
||||
device_info->error_message = "Error setting current device.";
|
||||
return device_info;
|
||||
}
|
||||
|
||||
cudaDeviceProp devProp;
|
||||
device_info->error = cudaGetDeviceProperties(&devProp, device_index);
|
||||
if(device_info->error != cudaSuccess) {
|
||||
device_info->error_message = "Error setting current device.";
|
||||
return device_info;
|
||||
}
|
||||
|
||||
device_info->device_string = devProp.name;
|
||||
|
||||
size_t freemem, totalmem;
|
||||
device_info->error = cudaMemGetInfo(&freemem, &totalmem);
|
||||
if(device_info->error != cudaSuccess) {
|
||||
device_info->error_message = "Error setting current device.";
|
||||
return device_info;
|
||||
}
|
||||
|
||||
device_info->free_mem_size = freemem;
|
||||
device_info->max_allocable_mem_size = freemem / 4;
|
||||
|
||||
double mem_in_gb = totalmem / 1073741824.0;
|
||||
stringstream ss;
|
||||
ss << setprecision(2) << mem_in_gb;
|
||||
device_info->device_string += (" (" + ss.str() + "GB)");
|
||||
|
||||
return device_info;
|
||||
}
|
||||
|
||||
bool cuda_hasher::configure(xmrig::HasherConfig &config) {
|
||||
int index = config.getGPUCardsCount();
|
||||
double intensity = 0;
|
||||
|
||||
int total_threads = 0;
|
||||
intensity = config.getAverageGPUIntensity();
|
||||
|
||||
if (intensity == 0) {
|
||||
m_intensity = 0;
|
||||
m_description = "Status: DISABLED - by user.";
|
||||
return false;
|
||||
}
|
||||
|
||||
bool cards_selected = false;
|
||||
intensity = 0;
|
||||
|
||||
for(vector<cuda_device_info *>::iterator d = __devices.begin(); d != __devices.end(); d++, index++) {
|
||||
stringstream ss;
|
||||
ss << "["<< (index + 1) << "] " << (*d)->device_string;
|
||||
string device_description = ss.str();
|
||||
(*d)->device_index = index;
|
||||
(*d)->profile_info.profile = m_profile;
|
||||
|
||||
if(config.gpuFilter().size() > 0) {
|
||||
bool found = false;
|
||||
for(xmrig::GPUFilter fit : config.gpuFilter()) {
|
||||
if(device_description.find(fit.filter) != string::npos) {
|
||||
found = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(!found) {
|
||||
(*d)->profile_info.threads = 0;
|
||||
ss << " - DISABLED" << endl;
|
||||
m_description += ss.str();
|
||||
continue;
|
||||
}
|
||||
else {
|
||||
cards_selected = true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
cards_selected = true;
|
||||
}
|
||||
|
||||
ss << endl;
|
||||
|
||||
double device_intensity = config.getGPUIntensity((*d)->device_index);
|
||||
|
||||
m_description += ss.str();
|
||||
|
||||
if(!(__setup_device_info((*d), device_intensity))) {
|
||||
m_description += (*d)->error_message;
|
||||
m_description += "\n";
|
||||
continue;
|
||||
};
|
||||
|
||||
DeviceInfo device;
|
||||
|
||||
char bus_id[100];
|
||||
if(cudaDeviceGetPCIBusId(bus_id, 100, (*d)->cuda_index) == cudaSuccess) {
|
||||
device.bus_id = bus_id;
|
||||
int domain_separator = device.bus_id.find(":");
|
||||
if(domain_separator != string::npos) {
|
||||
device.bus_id.erase(0, domain_separator + 1);
|
||||
}
|
||||
}
|
||||
|
||||
device.name = (*d)->device_string;
|
||||
device.intensity = device_intensity;
|
||||
storeDeviceInfo((*d)->device_index, device);
|
||||
|
||||
__enabledDevices.push_back(*d);
|
||||
|
||||
total_threads += (*d)->profile_info.threads;
|
||||
intensity += device_intensity;
|
||||
}
|
||||
|
||||
config.addGPUCardsCount(index - config.getGPUCardsCount());
|
||||
|
||||
if(!cards_selected) {
|
||||
m_intensity = 0;
|
||||
m_description += "Status: DISABLED - no card enabled because of filtering.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (total_threads == 0) {
|
||||
m_intensity = 0;
|
||||
m_description += "Status: DISABLED - not enough resources.";
|
||||
return false;
|
||||
}
|
||||
|
||||
if(!buildThreadData())
|
||||
return false;
|
||||
|
||||
m_intensity = intensity / __enabledDevices.size();
|
||||
m_computingThreads = __enabledDevices.size() * 2; // 2 computing threads for each device
|
||||
m_description += "Status: ENABLED - with " + to_string(total_threads) + " threads.";
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void cuda_hasher::cleanup() {
|
||||
for(vector<cuda_device_info *>::iterator d = __devices.begin(); d != __devices.end(); d++) {
|
||||
cuda_free(*d);
|
||||
}
|
||||
}
|
||||
|
||||
bool cuda_hasher::__setup_device_info(cuda_device_info *device, double intensity) {
|
||||
device->profile_info.threads_per_chunk = (uint32_t)(device->max_allocable_mem_size / device->profile_info.profile->memSize);
|
||||
size_t chunk_size = device->profile_info.threads_per_chunk * device->profile_info.profile->memSize;
|
||||
|
||||
if(chunk_size == 0) {
|
||||
device->error = cudaErrorInitializationError;
|
||||
device->error_message = "Not enough memory on GPU.";
|
||||
return false;
|
||||
}
|
||||
|
||||
uint64_t usable_memory = device->free_mem_size;
|
||||
double chunks = (double)usable_memory / (double)chunk_size;
|
||||
|
||||
uint32_t max_threads = (uint32_t)(device->profile_info.threads_per_chunk * chunks);
|
||||
|
||||
if(max_threads == 0) {
|
||||
device->error = cudaErrorInitializationError;
|
||||
device->error_message = "Not enough memory on GPU.";
|
||||
return false;
|
||||
}
|
||||
|
||||
device->profile_info.threads = (uint32_t)(max_threads * intensity / 100.0);
|
||||
device->profile_info.threads = (device->profile_info.threads / 2) * 2; // make it divisible by 2 to allow for parallel kernel execution
|
||||
if(max_threads > 0 && device->profile_info.threads == 0 && intensity > 0)
|
||||
device->profile_info.threads = 2;
|
||||
|
||||
chunks = (double)device->profile_info.threads / (double)device->profile_info.threads_per_chunk;
|
||||
|
||||
cuda_allocate(device, chunks, chunk_size);
|
||||
|
||||
if(device->error != cudaSuccess)
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool cuda_hasher::buildThreadData() {
|
||||
__thread_data = new cuda_gpumgmt_thread_data[__enabledDevices.size() * 2];
|
||||
|
||||
for(int i=0; i < __enabledDevices.size(); i++) {
|
||||
cuda_device_info *device = __enabledDevices[i];
|
||||
for(int threadId = 0; threadId < 2; threadId ++) {
|
||||
cuda_gpumgmt_thread_data &thread_data = __thread_data[i * 2 + threadId];
|
||||
thread_data.device = device;
|
||||
thread_data.thread_id = threadId;
|
||||
|
||||
cudaStream_t stream;
|
||||
device->error = cudaStreamCreate(&stream);
|
||||
if(device->error != cudaSuccess) {
|
||||
LOG("Error running kernel: (" + to_string(device->error) + ") cannot create cuda stream.");
|
||||
return false;
|
||||
}
|
||||
|
||||
thread_data.device_data = stream;
|
||||
|
||||
#ifdef PARALLEL_CUDA
|
||||
if(threadId == 0) {
|
||||
thread_data.threads_idx = 0;
|
||||
thread_data.threads = device->profile_info.threads / 2;
|
||||
}
|
||||
else {
|
||||
thread_data.threads_idx = device->profile_info.threads / 2;
|
||||
thread_data.threads = device->profile_info.threads - thread_data.threads_idx;
|
||||
}
|
||||
#else
|
||||
thread_data.threads_idx = 0;
|
||||
thread_data.threads = device->profile_info.threads;
|
||||
#endif
|
||||
|
||||
thread_data.argon2 = new Argon2(cuda_kernel_prehasher, cuda_kernel_filler, cuda_kernel_posthasher,
|
||||
nullptr, &thread_data);
|
||||
thread_data.argon2->setThreads(thread_data.threads);
|
||||
thread_data.hashData.outSize = xmrig::ARGON2_HASHLEN + 4;
|
||||
}
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int cuda_hasher::compute(int threadIdx, uint8_t *input, size_t size, uint8_t *output) {
|
||||
cuda_gpumgmt_thread_data &threadData = __thread_data[threadIdx];
|
||||
|
||||
cudaSetDevice(threadData.device->cuda_index);
|
||||
|
||||
threadData.hashData.input = input;
|
||||
threadData.hashData.inSize = size;
|
||||
threadData.hashData.output = output;
|
||||
int hashCount = threadData.argon2->generateHashes(*m_profile, threadData.hashData);
|
||||
if(threadData.device->error != cudaSuccess) {
|
||||
LOG("Error running kernel: (" + to_string(threadData.device->error) + ")" + threadData.device->error_message);
|
||||
return 0;
|
||||
}
|
||||
|
||||
uint32_t *nonce = ((uint32_t *)(((uint8_t*)threadData.hashData.input) + 39));
|
||||
(*nonce) += threadData.threads;
|
||||
|
||||
return hashCount;
|
||||
|
||||
}
|
||||
|
||||
size_t cuda_hasher::parallelism(int workerIdx) {
|
||||
cuda_gpumgmt_thread_data &threadData = __thread_data[workerIdx];
|
||||
return threadData.threads;
|
||||
}
|
||||
|
||||
size_t cuda_hasher::deviceCount() {
|
||||
return __enabledDevices.size();
|
||||
}
|
||||
|
||||
REGISTER_HASHER(cuda_hasher);
|
||||
|
||||
#endif //WITH_CUDA
|
126
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_hasher.h
Normal file
126
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_hasher.h
Normal file
|
@ -0,0 +1,126 @@
|
|||
//
|
||||
// Created by Haifa Bogdan Adnan on 18/09/2018.
|
||||
//
|
||||
|
||||
#ifndef ARGON2_CUDA_HASHER_H
|
||||
#define ARGON2_CUDA_HASHER_H
|
||||
|
||||
#if defined(WITH_CUDA)
|
||||
|
||||
struct cuda_kernel_arguments {
|
||||
void *memory_chunk_0;
|
||||
void *memory_chunk_1;
|
||||
void *memory_chunk_2;
|
||||
void *memory_chunk_3;
|
||||
void *memory_chunk_4;
|
||||
void *memory_chunk_5;
|
||||
|
||||
uint32_t *refs;
|
||||
uint32_t *idxs;
|
||||
uint32_t *segments;
|
||||
|
||||
uint32_t *preseed_memory[2];
|
||||
uint32_t *seed_memory[2];
|
||||
uint32_t *out_memory[2];
|
||||
uint32_t *hash_memory[2];
|
||||
|
||||
uint32_t *host_seed_memory[2];
|
||||
};
|
||||
|
||||
struct argon2profile_info {
|
||||
argon2profile_info() {
|
||||
threads = 0;
|
||||
threads_per_chunk = 0;
|
||||
}
|
||||
uint32_t threads;
|
||||
uint32_t threads_per_chunk;
|
||||
Argon2Profile *profile;
|
||||
};
|
||||
|
||||
struct cuda_device_info {
|
||||
cuda_device_info() {
|
||||
device_index = 0;
|
||||
device_string = "";
|
||||
free_mem_size = 0;
|
||||
max_allocable_mem_size = 0;
|
||||
|
||||
error = cudaSuccess;
|
||||
error_message = "";
|
||||
}
|
||||
|
||||
int device_index;
|
||||
int cuda_index;
|
||||
|
||||
string device_string;
|
||||
uint64_t free_mem_size;
|
||||
uint64_t max_allocable_mem_size;
|
||||
|
||||
argon2profile_info profile_info;
|
||||
cuda_kernel_arguments arguments;
|
||||
|
||||
mutex device_lock;
|
||||
|
||||
cudaError_t error;
|
||||
string error_message;
|
||||
};
|
||||
|
||||
struct cuda_gpumgmt_thread_data {
|
||||
void lock() {
|
||||
#ifndef PARALLEL_CUDA
|
||||
device->device_lock.lock();
|
||||
#endif
|
||||
}
|
||||
|
||||
void unlock() {
|
||||
#ifndef PARALLEL_CUDA
|
||||
device->device_lock.unlock();
|
||||
#endif
|
||||
}
|
||||
|
||||
int thread_id;
|
||||
cuda_device_info *device;
|
||||
Argon2 *argon2;
|
||||
HashData hashData;
|
||||
|
||||
void *device_data;
|
||||
|
||||
int threads;
|
||||
int threads_idx;
|
||||
};
|
||||
|
||||
class cuda_hasher : public Hasher {
|
||||
public:
|
||||
cuda_hasher();
|
||||
~cuda_hasher();
|
||||
|
||||
virtual bool initialize(xmrig::Algo algorithm, xmrig::Variant variant);
|
||||
virtual bool configure(xmrig::HasherConfig &config);
|
||||
virtual void cleanup();
|
||||
virtual int compute(int threadIdx, uint8_t *input, size_t size, uint8_t *output);
|
||||
virtual size_t parallelism(int workerIdx);
|
||||
virtual size_t deviceCount();
|
||||
|
||||
private:
|
||||
cuda_device_info *__get_device_info(int device_index);
|
||||
bool __setup_device_info(cuda_device_info *device, double intensity);
|
||||
vector<cuda_device_info*> __query_cuda_devices(cudaError_t &error, string &error_message);
|
||||
bool buildThreadData();
|
||||
|
||||
vector<cuda_device_info*> __devices;
|
||||
vector<cuda_device_info*> __enabledDevices;
|
||||
cuda_gpumgmt_thread_data *__thread_data;
|
||||
|
||||
Argon2Profile *m_profile;
|
||||
};
|
||||
|
||||
// CUDA kernel exports
|
||||
extern void cuda_allocate(cuda_device_info *device, double chunks, size_t chunk_size);
|
||||
extern void cuda_free(cuda_device_info *device);
|
||||
extern bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, void *user_data);
|
||||
extern void *cuda_kernel_filler(int threads, Argon2Profile *profile, void *user_data);
|
||||
extern bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, void *user_data);
|
||||
// end CUDA kernel exports
|
||||
|
||||
#endif //WITH_CUDA
|
||||
|
||||
#endif //ARGON2_CUDA_HASHER_H
|
1132
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
Normal file
1132
src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
Normal file
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue