commit
4e9134d829
6 changed files with 141 additions and 61 deletions
|
@ -394,7 +394,13 @@ if(WITH_CUDA)
|
||||||
set(
|
set(
|
||||||
CUDA_NVCC_FLAGS
|
CUDA_NVCC_FLAGS
|
||||||
${CUDA_NVCC_FLAGS};
|
${CUDA_NVCC_FLAGS};
|
||||||
-O3 -arch=compute_35 -std=c++11
|
-O3 -std=c++11 --ptxas-options=-v
|
||||||
|
-gencode=arch=compute_75,code="sm_75,compute_75"
|
||||||
|
-gencode=arch=compute_61,code="sm_61,compute_61"
|
||||||
|
-gencode=arch=compute_52,code="sm_52,compute_52"
|
||||||
|
-gencode=arch=compute_50,code="sm_50,compute_50"
|
||||||
|
-gencode=arch=compute_35,code="sm_35,compute_35"
|
||||||
|
-gencode=arch=compute_30,code="sm_30,compute_30"
|
||||||
)
|
)
|
||||||
cuda_add_library(cuda_hasher MODULE ${SOURCE_CUDA_HASHER})
|
cuda_add_library(cuda_hasher MODULE ${SOURCE_CUDA_HASHER})
|
||||||
set_target_properties(cuda_hasher
|
set_target_properties(cuda_hasher
|
||||||
|
|
|
@ -86,7 +86,7 @@ void xmrig::Config::getJSON(rapidjson::Document &doc) const
|
||||||
if(cpuOptimization().isNull() || cpuOptimization().isEmpty())
|
if(cpuOptimization().isNull() || cpuOptimization().isEmpty())
|
||||||
doc.AddMember("cpu-optimization", kNullType, allocator);
|
doc.AddMember("cpu-optimization", kNullType, allocator);
|
||||||
else
|
else
|
||||||
doc.AddMember("cpu-optimization", StringRef(cpuOptimization().data()), allocator);
|
doc.AddMember("cpu-optimization", cpuOptimization().toJSON(doc), allocator);
|
||||||
|
|
||||||
if (cpuAffinity() != -1L) {
|
if (cpuAffinity() != -1L) {
|
||||||
snprintf(affinity_tmp, sizeof(affinity_tmp) - 1, "0x%" PRIX64, cpuAffinity());
|
snprintf(affinity_tmp, sizeof(affinity_tmp) - 1, "0x%" PRIX64, cpuAffinity());
|
||||||
|
|
|
@ -53,6 +53,8 @@ bool CudaHasher::initialize(xmrig::Algo algorithm, xmrig::Variant variant) {
|
||||||
}
|
}
|
||||||
|
|
||||||
vector<CudaDeviceInfo *> CudaHasher::queryCudaDevices(cudaError_t &error, string &error_message) {
|
vector<CudaDeviceInfo *> CudaHasher::queryCudaDevices(cudaError_t &error, string &error_message) {
|
||||||
|
cudaSetDeviceFlags(cudaDeviceBlockingSync);
|
||||||
|
|
||||||
vector<CudaDeviceInfo *> devices;
|
vector<CudaDeviceInfo *> devices;
|
||||||
int devCount = 0;
|
int devCount = 0;
|
||||||
error = cudaGetDeviceCount(&devCount);
|
error = cudaGetDeviceCount(&devCount);
|
||||||
|
|
|
@ -288,6 +288,8 @@ __device__ __forceinline__ void blake2b_final(uint32_t *out, int out_len, uint64
|
||||||
|
|
||||||
blake2b_compress(h, (uint64_t*)buf, 0xFFFFFFFFFFFFFFFF, thr_id);
|
blake2b_compress(h, (uint64_t*)buf, 0xFFFFFFFFFFFFFFFF, thr_id);
|
||||||
|
|
||||||
|
__syncthreads();
|
||||||
|
|
||||||
uint32_t *cursor_in = (uint32_t *)h;
|
uint32_t *cursor_in = (uint32_t *)h;
|
||||||
cursor_out = out;
|
cursor_out = out;
|
||||||
|
|
||||||
|
|
|
@ -19,57 +19,95 @@
|
||||||
|
|
||||||
#include "blake2b.cu"
|
#include "blake2b.cu"
|
||||||
|
|
||||||
#define COMPUTE \
|
#ifndef __CUDA_ARCH__
|
||||||
asm ("{" \
|
#define __CUDA_ARCH__ 0
|
||||||
".reg .u32 s1, s2, s3, s4;\n\t" \
|
#endif
|
||||||
"mul.lo.u32 s3, %0, %2;\n\t" \
|
|
||||||
"mul.hi.u32 s4, %0, %2;\n\t" \
|
#if (__CUDA_ARCH__ >= 350)
|
||||||
"add.cc.u32 s3, s3, s3;\n\t" \
|
#define COMPUTE \
|
||||||
"addc.u32 s4, s4, s4;\n\t" \
|
asm ("{" \
|
||||||
"add.cc.u32 s1, %0, %2;\n\t" \
|
".reg .u32 s1, s2, s3, s4;\n\t" \
|
||||||
"addc.u32 s2, %1, %3;\n\t" \
|
"mul.lo.u32 s3, %0, %2;\n\t" \
|
||||||
"add.cc.u32 %0, s1, s3;\n\t" \
|
"mul.hi.u32 s4, %0, %2;\n\t" \
|
||||||
"addc.u32 %1, s2, s4;\n\t" \
|
"add.cc.u32 s3, s3, s3;\n\t" \
|
||||||
"xor.b32 s1, %0, %6;\n\t" \
|
"addc.u32 s4, s4, s4;\n\t" \
|
||||||
"xor.b32 %6, %1, %7;\n\t" \
|
"add.cc.u32 s1, %0, %2;\n\t" \
|
||||||
"mov.b32 %7, s1;\n\t" \
|
"addc.u32 s2, %1, %3;\n\t" \
|
||||||
"mul.lo.u32 s3, %4, %6;\n\t" \
|
"add.cc.u32 %0, s1, s3;\n\t" \
|
||||||
"mul.hi.u32 s4, %4, %6;\n\t" \
|
"addc.u32 %1, s2, s4;\n\t" \
|
||||||
"add.cc.u32 s3, s3, s3;\n\t" \
|
"xor.b32 s1, %0, %6;\n\t" \
|
||||||
"addc.u32 s4, s4, s4;\n\t" \
|
"xor.b32 %6, %1, %7;\n\t" \
|
||||||
"add.cc.u32 s1, %4, %6;\n\t" \
|
"mov.b32 %7, s1;\n\t" \
|
||||||
"addc.u32 s2, %5, %7;\n\t" \
|
"mul.lo.u32 s3, %4, %6;\n\t" \
|
||||||
"add.cc.u32 %4, s1, s3;\n\t" \
|
"mul.hi.u32 s4, %4, %6;\n\t" \
|
||||||
"addc.u32 %5, s2, s4;\n\t" \
|
"add.cc.u32 s3, s3, s3;\n\t" \
|
||||||
"xor.b32 s3, %2, %4;\n\t" \
|
"addc.u32 s4, s4, s4;\n\t" \
|
||||||
"xor.b32 s4, %3, %5;\n\t" \
|
"add.cc.u32 s1, %4, %6;\n\t" \
|
||||||
"shf.r.wrap.b32 %3, s4, s3, 24;\n\t" \
|
"addc.u32 s2, %5, %7;\n\t" \
|
||||||
"shf.r.wrap.b32 %2, s3, s4, 24;\n\t" \
|
"add.cc.u32 %4, s1, s3;\n\t" \
|
||||||
"mul.lo.u32 s3, %0, %2;\n\t" \
|
"addc.u32 %5, s2, s4;\n\t" \
|
||||||
"mul.hi.u32 s4, %0, %2;\n\t" \
|
"xor.b32 s3, %2, %4;\n\t" \
|
||||||
"add.cc.u32 s3, s3, s3;\n\t" \
|
"xor.b32 s4, %3, %5;\n\t" \
|
||||||
"addc.u32 s4, s4, s4;\n\t" \
|
"shf.r.wrap.b32 %3, s4, s3, 24;\n\t" \
|
||||||
"add.cc.u32 s1, %0, %2;\n\t" \
|
"shf.r.wrap.b32 %2, s3, s4, 24;\n\t" \
|
||||||
"addc.u32 s2, %1, %3;\n\t" \
|
"mul.lo.u32 s3, %0, %2;\n\t" \
|
||||||
"add.cc.u32 %0, s1, s3;\n\t" \
|
"mul.hi.u32 s4, %0, %2;\n\t" \
|
||||||
"addc.u32 %1, s2, s4;\n\t" \
|
"add.cc.u32 s3, s3, s3;\n\t" \
|
||||||
"xor.b32 s3, %0, %6;\n\t" \
|
"addc.u32 s4, s4, s4;\n\t" \
|
||||||
"xor.b32 s4, %1, %7;\n\t" \
|
"add.cc.u32 s1, %0, %2;\n\t" \
|
||||||
"shf.r.wrap.b32 %7, s4, s3, 16;\n\t" \
|
"addc.u32 s2, %1, %3;\n\t" \
|
||||||
"shf.r.wrap.b32 %6, s3, s4, 16;\n\t" \
|
"add.cc.u32 %0, s1, s3;\n\t" \
|
||||||
"mul.lo.u32 s3, %4, %6;\n\t" \
|
"addc.u32 %1, s2, s4;\n\t" \
|
||||||
"mul.hi.u32 s4, %4, %6;\n\t" \
|
"xor.b32 s3, %0, %6;\n\t" \
|
||||||
"add.cc.u32 s3, s3, s3;\n\t" \
|
"xor.b32 s4, %1, %7;\n\t" \
|
||||||
"addc.u32 s4, s4, s4;\n\t" \
|
"shf.r.wrap.b32 %7, s4, s3, 16;\n\t" \
|
||||||
"add.cc.u32 s1, %4, %6;\n\t" \
|
"shf.r.wrap.b32 %6, s3, s4, 16;\n\t" \
|
||||||
"addc.u32 s2, %5, %7;\n\t" \
|
"mul.lo.u32 s3, %4, %6;\n\t" \
|
||||||
"add.cc.u32 %4, s1, s3;\n\t" \
|
"mul.hi.u32 s4, %4, %6;\n\t" \
|
||||||
"addc.u32 %5, s2, s4;\n\t" \
|
"add.cc.u32 s3, s3, s3;\n\t" \
|
||||||
"xor.b32 s3, %2, %4;\n\t" \
|
"addc.u32 s4, s4, s4;\n\t" \
|
||||||
"xor.b32 s4, %3, %5;\n\t" \
|
"add.cc.u32 s1, %4, %6;\n\t" \
|
||||||
"shf.r.wrap.b32 %3, s3, s4, 31;\n\t" \
|
"addc.u32 s2, %5, %7;\n\t" \
|
||||||
"shf.r.wrap.b32 %2, s4, s3, 31;\n\t" \
|
"add.cc.u32 %4, s1, s3;\n\t" \
|
||||||
"}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w));
|
"addc.u32 %5, s2, s4;\n\t" \
|
||||||
|
"xor.b32 s3, %2, %4;\n\t" \
|
||||||
|
"xor.b32 s4, %3, %5;\n\t" \
|
||||||
|
"shf.r.wrap.b32 %3, s3, s4, 31;\n\t" \
|
||||||
|
"shf.r.wrap.b32 %2, s4, s3, 31;\n\t" \
|
||||||
|
"}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w));
|
||||||
|
#else
|
||||||
|
#define downsample(x, lo, hi) \
|
||||||
|
{ \
|
||||||
|
lo = (uint32_t)x; \
|
||||||
|
hi = (uint32_t)(x >> 32); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define upsample(lo, hi) (((uint64_t)(hi) << 32) | (uint64_t)(lo))
|
||||||
|
|
||||||
|
#define rotate(x, n) (((x) >> (64-n)) | ((x) << n))
|
||||||
|
|
||||||
|
#define fBlaMka(x, y) ((x) + (y) + 2 * upsample((uint32_t)(x) * (uint32_t)y, __umulhi((uint32_t)(x), (uint32_t)(y))))
|
||||||
|
|
||||||
|
#define COMPUTE \
|
||||||
|
{ \
|
||||||
|
uint64_t a64 = upsample(tmp_a.x, tmp_a.y); \
|
||||||
|
uint64_t b64 = upsample(tmp_a.z, tmp_a.w); \
|
||||||
|
uint64_t c64 = upsample(tmp_b.x, tmp_b.y); \
|
||||||
|
uint64_t d64 = upsample(tmp_b.z, tmp_b.w); \
|
||||||
|
a64 = fBlaMka(a64, b64); \
|
||||||
|
d64 = rotate(d64 ^ a64, 32); \
|
||||||
|
c64 = fBlaMka(c64, d64); \
|
||||||
|
b64 = rotate(b64 ^ c64, 40); \
|
||||||
|
a64 = fBlaMka(a64, b64); \
|
||||||
|
d64 = rotate(d64 ^ a64, 48); \
|
||||||
|
c64 = fBlaMka(c64, d64); \
|
||||||
|
b64 = rotate(b64 ^ c64, 1); \
|
||||||
|
downsample(a64, tmp_a.x, tmp_a.y); \
|
||||||
|
downsample(b64, tmp_a.z, tmp_a.w); \
|
||||||
|
downsample(c64, tmp_b.x, tmp_b.y); \
|
||||||
|
downsample(d64, tmp_b.z, tmp_b.w); \
|
||||||
|
}
|
||||||
|
#endif // __CUDA_ARCH__
|
||||||
|
|
||||||
#define G1(data) \
|
#define G1(data) \
|
||||||
{ \
|
{ \
|
||||||
|
@ -1039,6 +1077,18 @@ void cuda_free(CudaDeviceInfo *device) {
|
||||||
cudaDeviceReset();
|
cudaDeviceReset();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline bool cudaCheckError(cudaError &err, string &errStr)
|
||||||
|
{
|
||||||
|
err = cudaGetLastError();
|
||||||
|
if ( cudaSuccess != err )
|
||||||
|
{
|
||||||
|
errStr = string("CUDA error: ") + cudaGetErrorString( err );
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
|
bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
|
||||||
CudaGpuMgmtThreadData *gpumgmt_thread = (CudaGpuMgmtThreadData *)user_data;
|
CudaGpuMgmtThreadData *gpumgmt_thread = (CudaGpuMgmtThreadData *)user_data;
|
||||||
CudaDeviceInfo *device = gpumgmt_thread->device;
|
CudaDeviceInfo *device = gpumgmt_thread->device;
|
||||||
|
@ -1069,6 +1119,12 @@ bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, vo
|
||||||
profile->saltLen,
|
profile->saltLen,
|
||||||
threads);
|
threads);
|
||||||
|
|
||||||
|
bool success = cudaCheckError(device->error, device->errorMessage);
|
||||||
|
if(!success) {
|
||||||
|
gpumgmt_thread->unlock();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1098,7 +1154,13 @@ void *cuda_kernel_filler(int threads, Argon2Profile *profile, void *user_data) {
|
||||||
device->profileInfo.threads_per_chunk,
|
device->profileInfo.threads_per_chunk,
|
||||||
gpumgmt_thread->threadsIdx);
|
gpumgmt_thread->threadsIdx);
|
||||||
|
|
||||||
return (void *)1;
|
bool success = cudaCheckError(device->error, device->errorMessage);
|
||||||
|
if(!success) {
|
||||||
|
gpumgmt_thread->unlock();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (void *)1;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
|
bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
|
||||||
|
@ -1113,6 +1175,11 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v
|
||||||
device->arguments.outMemory[gpumgmt_thread->threadId],
|
device->arguments.outMemory[gpumgmt_thread->threadId],
|
||||||
device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
device->arguments.preseedMemory[gpumgmt_thread->threadId]);
|
||||||
|
|
||||||
|
if(!cudaCheckError(device->error, device->errorMessage)) {
|
||||||
|
gpumgmt_thread->unlock();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
device->error = cudaMemcpyAsync(device->arguments.hostSeedMemory[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4), cudaMemcpyDeviceToHost, stream);
|
device->error = cudaMemcpyAsync(device->arguments.hostSeedMemory[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4), cudaMemcpyDeviceToHost, stream);
|
||||||
if (device->error != cudaSuccess) {
|
if (device->error != cudaSuccess) {
|
||||||
device->errorMessage = "Error reading gpu memory.";
|
device->errorMessage = "Error reading gpu memory.";
|
||||||
|
@ -1120,13 +1187,16 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
while(cudaStreamQuery(stream) != cudaSuccess) {
|
cudaStreamSynchronize(stream);
|
||||||
this_thread::sleep_for(chrono::milliseconds(10));
|
|
||||||
continue;
|
bool success = cudaCheckError(device->error, device->errorMessage);
|
||||||
}
|
if(!success) {
|
||||||
|
gpumgmt_thread->unlock();
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
memcpy(memory, device->arguments.hostSeedMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4));
|
memcpy(memory, device->arguments.hostSeedMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4));
|
||||||
gpumgmt_thread->unlock();
|
gpumgmt_thread->unlock();
|
||||||
|
|
||||||
return memory;
|
return true;
|
||||||
}
|
}
|
|
@ -28,7 +28,7 @@
|
||||||
#define APP_ID "ninjarig"
|
#define APP_ID "ninjarig"
|
||||||
#define APP_NAME "NinjaRig"
|
#define APP_NAME "NinjaRig"
|
||||||
#define APP_DESC "NinjaRig CPU/GPU miner"
|
#define APP_DESC "NinjaRig CPU/GPU miner"
|
||||||
#define APP_VERSION "1.0.1"
|
#define APP_VERSION "1.0.2"
|
||||||
//#define APP_DOMAIN "xmrig.com"
|
//#define APP_DOMAIN "xmrig.com"
|
||||||
//#define APP_SITE "www.xmrig.com"
|
//#define APP_SITE "www.xmrig.com"
|
||||||
#define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan"
|
#define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan"
|
||||||
|
@ -36,7 +36,7 @@
|
||||||
|
|
||||||
#define APP_VER_MAJOR 1
|
#define APP_VER_MAJOR 1
|
||||||
#define APP_VER_MINOR 0
|
#define APP_VER_MINOR 0
|
||||||
#define APP_VER_PATCH 1
|
#define APP_VER_PATCH 2
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
# if (_MSC_VER >= 1920)
|
# if (_MSC_VER >= 1920)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue