From 05758cda3604f8931bb5da9791fdf41a8a7ab40b Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Fri, 6 Sep 2019 12:29:30 +0300 Subject: [PATCH 1/5] Add cudaSyncronizeStream and cuda error check for kernel run. --- .../hash/gpu/cuda/CudaHasher.cpp | 2 + .../hash/gpu/cuda/cuda_kernel.cu | 44 ++++++++++++++++--- 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp index 60a6cbfd..6be1f621 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp @@ -53,6 +53,8 @@ bool CudaHasher::initialize(xmrig::Algo algorithm, xmrig::Variant variant) { } vector CudaHasher::queryCudaDevices(cudaError_t &error, string &error_message) { + cudaSetDeviceFlags(cudaDeviceBlockingSync); + vector devices; int devCount = 0; error = cudaGetDeviceCount(&devCount); diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu index c1f2d8a7..92530e39 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu @@ -1039,6 +1039,18 @@ void cuda_free(CudaDeviceInfo *device) { cudaDeviceReset(); } +inline bool cudaCheckError(cudaError &err, string &errStr) +{ + err = cudaGetLastError(); + if ( cudaSuccess != err ) + { + errStr = string("CUDA error: ") + cudaGetErrorString( err ); + return false; + } + + return true; +} + bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, void *user_data) { CudaGpuMgmtThreadData *gpumgmt_thread = (CudaGpuMgmtThreadData *)user_data; CudaDeviceInfo *device = gpumgmt_thread->device; @@ -1069,6 +1081,12 @@ bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, vo profile->saltLen, threads); + bool success = cudaCheckError(device->error, device->errorMessage); + if(!success) { + gpumgmt_thread->unlock(); + return false; + } + return true; } @@ -1098,7 +1116,13 @@ void *cuda_kernel_filler(int threads, Argon2Profile *profile, void *user_data) { device->profileInfo.threads_per_chunk, gpumgmt_thread->threadsIdx); - return (void *)1; + bool success = cudaCheckError(device->error, device->errorMessage); + if(!success) { + gpumgmt_thread->unlock(); + return NULL; + } + + return (void *)1; } bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, void *user_data) { @@ -1113,6 +1137,11 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v device->arguments.outMemory[gpumgmt_thread->threadId], device->arguments.preseedMemory[gpumgmt_thread->threadId]); + if(!cudaCheckError(device->error, device->errorMessage)) { + gpumgmt_thread->unlock(); + return false; + } + device->error = cudaMemcpyAsync(device->arguments.hostSeedMemory[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4), cudaMemcpyDeviceToHost, stream); if (device->error != cudaSuccess) { device->errorMessage = "Error reading gpu memory."; @@ -1120,13 +1149,16 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v return false; } - while(cudaStreamQuery(stream) != cudaSuccess) { - this_thread::sleep_for(chrono::milliseconds(10)); - continue; - } + cudaStreamSynchronize(stream); + + bool success = cudaCheckError(device->error, device->errorMessage); + if(!success) { + gpumgmt_thread->unlock(); + return false; + } memcpy(memory, device->arguments.hostSeedMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4)); gpumgmt_thread->unlock(); - return memory; + return true; } \ No newline at end of file From 329997a40aa6119dd5a41bd2b58ca9a5f8fccfe3 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Fri, 6 Sep 2019 15:07:36 +0300 Subject: [PATCH 2/5] Fix for GTX 20XX series giving rejected shares. --- src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu index db94e488..47d2c3d4 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu @@ -288,6 +288,8 @@ __device__ __forceinline__ void blake2b_final(uint32_t *out, int out_len, uint64 blake2b_compress(h, (uint64_t*)buf, 0xFFFFFFFFFFFFFFFF, thr_id); + __syncthreads(); + uint32_t *cursor_in = (uint32_t *)h; cursor_out = out; From c864c27f1377369280d74b8f555e81589139c158 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Fri, 6 Sep 2019 15:22:53 +0300 Subject: [PATCH 3/5] Fix for cpu optimization not being properly saved to config file. --- src/core/Config.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/core/Config.cpp b/src/core/Config.cpp index 3b371e04..8d292ae9 100644 --- a/src/core/Config.cpp +++ b/src/core/Config.cpp @@ -86,7 +86,7 @@ void xmrig::Config::getJSON(rapidjson::Document &doc) const if(cpuOptimization().isNull() || cpuOptimization().isEmpty()) doc.AddMember("cpu-optimization", kNullType, allocator); else - doc.AddMember("cpu-optimization", StringRef(cpuOptimization().data()), allocator); + doc.AddMember("cpu-optimization", cpuOptimization().toJSON(doc), allocator); if (cpuAffinity() != -1L) { snprintf(affinity_tmp, sizeof(affinity_tmp) - 1, "0x%" PRIX64, cpuAffinity()); From d8daeda7bab3dcea754283f821431ef3d2811b94 Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Fri, 6 Sep 2019 16:28:55 +0300 Subject: [PATCH 4/5] Added support for CUDA compute capability 3.0. Multiple CUDA architecture compilation support. --- CMakeLists.txt | 8 +- .../hash/gpu/cuda/cuda_kernel.cu | 140 +++++++++++------- 2 files changed, 96 insertions(+), 52 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ed6597d0..acb62108 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -394,7 +394,13 @@ if(WITH_CUDA) set( CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; - -O3 -arch=compute_35 -std=c++11 + -O3 -std=c++11 --ptxas-options=-v + -gencode=arch=compute_75,code="sm_75,compute_75" + -gencode=arch=compute_61,code="sm_61,compute_61" + -gencode=arch=compute_52,code="sm_52,compute_52" + -gencode=arch=compute_50,code="sm_50,compute_50" + -gencode=arch=compute_35,code="sm_35,compute_35" + -gencode=arch=compute_30,code="sm_30,compute_30" ) cuda_add_library(cuda_hasher MODULE ${SOURCE_CUDA_HASHER}) set_target_properties(cuda_hasher diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu index 92530e39..b9bad2b0 100644 --- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu +++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu @@ -19,57 +19,95 @@ #include "blake2b.cu" -#define COMPUTE \ - asm ("{" \ - ".reg .u32 s1, s2, s3, s4;\n\t" \ - "mul.lo.u32 s3, %0, %2;\n\t" \ - "mul.hi.u32 s4, %0, %2;\n\t" \ - "add.cc.u32 s3, s3, s3;\n\t" \ - "addc.u32 s4, s4, s4;\n\t" \ - "add.cc.u32 s1, %0, %2;\n\t" \ - "addc.u32 s2, %1, %3;\n\t" \ - "add.cc.u32 %0, s1, s3;\n\t" \ - "addc.u32 %1, s2, s4;\n\t" \ - "xor.b32 s1, %0, %6;\n\t" \ - "xor.b32 %6, %1, %7;\n\t" \ - "mov.b32 %7, s1;\n\t" \ - "mul.lo.u32 s3, %4, %6;\n\t" \ - "mul.hi.u32 s4, %4, %6;\n\t" \ - "add.cc.u32 s3, s3, s3;\n\t" \ - "addc.u32 s4, s4, s4;\n\t" \ - "add.cc.u32 s1, %4, %6;\n\t" \ - "addc.u32 s2, %5, %7;\n\t" \ - "add.cc.u32 %4, s1, s3;\n\t" \ - "addc.u32 %5, s2, s4;\n\t" \ - "xor.b32 s3, %2, %4;\n\t" \ - "xor.b32 s4, %3, %5;\n\t" \ - "shf.r.wrap.b32 %3, s4, s3, 24;\n\t" \ - "shf.r.wrap.b32 %2, s3, s4, 24;\n\t" \ - "mul.lo.u32 s3, %0, %2;\n\t" \ - "mul.hi.u32 s4, %0, %2;\n\t" \ - "add.cc.u32 s3, s3, s3;\n\t" \ - "addc.u32 s4, s4, s4;\n\t" \ - "add.cc.u32 s1, %0, %2;\n\t" \ - "addc.u32 s2, %1, %3;\n\t" \ - "add.cc.u32 %0, s1, s3;\n\t" \ - "addc.u32 %1, s2, s4;\n\t" \ - "xor.b32 s3, %0, %6;\n\t" \ - "xor.b32 s4, %1, %7;\n\t" \ - "shf.r.wrap.b32 %7, s4, s3, 16;\n\t" \ - "shf.r.wrap.b32 %6, s3, s4, 16;\n\t" \ - "mul.lo.u32 s3, %4, %6;\n\t" \ - "mul.hi.u32 s4, %4, %6;\n\t" \ - "add.cc.u32 s3, s3, s3;\n\t" \ - "addc.u32 s4, s4, s4;\n\t" \ - "add.cc.u32 s1, %4, %6;\n\t" \ - "addc.u32 s2, %5, %7;\n\t" \ - "add.cc.u32 %4, s1, s3;\n\t" \ - "addc.u32 %5, s2, s4;\n\t" \ - "xor.b32 s3, %2, %4;\n\t" \ - "xor.b32 s4, %3, %5;\n\t" \ - "shf.r.wrap.b32 %3, s3, s4, 31;\n\t" \ - "shf.r.wrap.b32 %2, s4, s3, 31;\n\t" \ - "}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w)); +#ifndef __CUDA_ARCH__ +#define __CUDA_ARCH__ 0 +#endif + +#if (__CUDA_ARCH__ >= 350) + #define COMPUTE \ + asm ("{" \ + ".reg .u32 s1, s2, s3, s4;\n\t" \ + "mul.lo.u32 s3, %0, %2;\n\t" \ + "mul.hi.u32 s4, %0, %2;\n\t" \ + "add.cc.u32 s3, s3, s3;\n\t" \ + "addc.u32 s4, s4, s4;\n\t" \ + "add.cc.u32 s1, %0, %2;\n\t" \ + "addc.u32 s2, %1, %3;\n\t" \ + "add.cc.u32 %0, s1, s3;\n\t" \ + "addc.u32 %1, s2, s4;\n\t" \ + "xor.b32 s1, %0, %6;\n\t" \ + "xor.b32 %6, %1, %7;\n\t" \ + "mov.b32 %7, s1;\n\t" \ + "mul.lo.u32 s3, %4, %6;\n\t" \ + "mul.hi.u32 s4, %4, %6;\n\t" \ + "add.cc.u32 s3, s3, s3;\n\t" \ + "addc.u32 s4, s4, s4;\n\t" \ + "add.cc.u32 s1, %4, %6;\n\t" \ + "addc.u32 s2, %5, %7;\n\t" \ + "add.cc.u32 %4, s1, s3;\n\t" \ + "addc.u32 %5, s2, s4;\n\t" \ + "xor.b32 s3, %2, %4;\n\t" \ + "xor.b32 s4, %3, %5;\n\t" \ + "shf.r.wrap.b32 %3, s4, s3, 24;\n\t" \ + "shf.r.wrap.b32 %2, s3, s4, 24;\n\t" \ + "mul.lo.u32 s3, %0, %2;\n\t" \ + "mul.hi.u32 s4, %0, %2;\n\t" \ + "add.cc.u32 s3, s3, s3;\n\t" \ + "addc.u32 s4, s4, s4;\n\t" \ + "add.cc.u32 s1, %0, %2;\n\t" \ + "addc.u32 s2, %1, %3;\n\t" \ + "add.cc.u32 %0, s1, s3;\n\t" \ + "addc.u32 %1, s2, s4;\n\t" \ + "xor.b32 s3, %0, %6;\n\t" \ + "xor.b32 s4, %1, %7;\n\t" \ + "shf.r.wrap.b32 %7, s4, s3, 16;\n\t" \ + "shf.r.wrap.b32 %6, s3, s4, 16;\n\t" \ + "mul.lo.u32 s3, %4, %6;\n\t" \ + "mul.hi.u32 s4, %4, %6;\n\t" \ + "add.cc.u32 s3, s3, s3;\n\t" \ + "addc.u32 s4, s4, s4;\n\t" \ + "add.cc.u32 s1, %4, %6;\n\t" \ + "addc.u32 s2, %5, %7;\n\t" \ + "add.cc.u32 %4, s1, s3;\n\t" \ + "addc.u32 %5, s2, s4;\n\t" \ + "xor.b32 s3, %2, %4;\n\t" \ + "xor.b32 s4, %3, %5;\n\t" \ + "shf.r.wrap.b32 %3, s3, s4, 31;\n\t" \ + "shf.r.wrap.b32 %2, s4, s3, 31;\n\t" \ + "}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w)); +#else + #define downsample(x, lo, hi) \ + { \ + lo = (uint32_t)x; \ + hi = (uint32_t)(x >> 32); \ + } + + #define upsample(lo, hi) (((uint64_t)(hi) << 32) | (uint64_t)(lo)) + + #define rotate(x, n) (((x) >> (64-n)) | ((x) << n)) + + #define fBlaMka(x, y) ((x) + (y) + 2 * upsample((uint32_t)(x) * (uint32_t)y, __umulhi((uint32_t)(x), (uint32_t)(y)))) + + #define COMPUTE \ + { \ + uint64_t a64 = upsample(tmp_a.x, tmp_a.y); \ + uint64_t b64 = upsample(tmp_a.z, tmp_a.w); \ + uint64_t c64 = upsample(tmp_b.x, tmp_b.y); \ + uint64_t d64 = upsample(tmp_b.z, tmp_b.w); \ + a64 = fBlaMka(a64, b64); \ + d64 = rotate(d64 ^ a64, 32); \ + c64 = fBlaMka(c64, d64); \ + b64 = rotate(b64 ^ c64, 40); \ + a64 = fBlaMka(a64, b64); \ + d64 = rotate(d64 ^ a64, 48); \ + c64 = fBlaMka(c64, d64); \ + b64 = rotate(b64 ^ c64, 1); \ + downsample(a64, tmp_a.x, tmp_a.y); \ + downsample(b64, tmp_a.z, tmp_a.w); \ + downsample(c64, tmp_b.x, tmp_b.y); \ + downsample(d64, tmp_b.z, tmp_b.w); \ + } +#endif // __CUDA_ARCH__ #define G1(data) \ { \ From 3f389bbf1b064edf5a4247cdeb9c55ee6db3fcde Mon Sep 17 00:00:00 2001 From: Haifa Bogdan Adnan Date: Fri, 6 Sep 2019 20:46:54 +0300 Subject: [PATCH 5/5] Update version.h --- src/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.h b/src/version.h index 5f183498..68022e07 100644 --- a/src/version.h +++ b/src/version.h @@ -28,7 +28,7 @@ #define APP_ID "ninjarig" #define APP_NAME "NinjaRig" #define APP_DESC "NinjaRig CPU/GPU miner" -#define APP_VERSION "1.0.1" +#define APP_VERSION "1.0.2" //#define APP_DOMAIN "xmrig.com" //#define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan" @@ -36,7 +36,7 @@ #define APP_VER_MAJOR 1 #define APP_VER_MINOR 0 -#define APP_VER_PATCH 1 +#define APP_VER_PATCH 2 #ifdef _MSC_VER # if (_MSC_VER >= 1920)