From 05758cda3604f8931bb5da9791fdf41a8a7ab40b Mon Sep 17 00:00:00 2001
From: Haifa Bogdan Adnan <bogdan_adnan@yahoo.com>
Date: Fri, 6 Sep 2019 12:29:30 +0300
Subject: [PATCH 1/5] Add cudaSyncronizeStream and cuda error check for kernel
 run.

---
 .../hash/gpu/cuda/CudaHasher.cpp              |  2 +
 .../hash/gpu/cuda/cuda_kernel.cu              | 44 ++++++++++++++++---
 2 files changed, 40 insertions(+), 6 deletions(-)
diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp b/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp
index 60a6cbfd..6be1f621 100644
--- a/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp
+++ b/src/crypto/argon2_hasher/hash/gpu/cuda/CudaHasher.cpp
@@ -53,6 +53,8 @@ bool CudaHasher::initialize(xmrig::Algo algorithm, xmrig::Variant variant) {
 }
 
 vector<CudaDeviceInfo *> CudaHasher::queryCudaDevices(cudaError_t &error, string &error_message) {
+    cudaSetDeviceFlags(cudaDeviceBlockingSync);
+
 	vector<CudaDeviceInfo *> devices;
 	int devCount = 0;
 	error = cudaGetDeviceCount(&devCount);
diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
index c1f2d8a7..92530e39 100644
--- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
+++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
@@ -1039,6 +1039,18 @@ void cuda_free(CudaDeviceInfo *device) {
 	cudaDeviceReset();
 }
 
+inline bool cudaCheckError(cudaError &err, string &errStr)
+{
+    err = cudaGetLastError();
+    if ( cudaSuccess != err )
+    {
+        errStr = string("CUDA error: ") + cudaGetErrorString( err );
+        return false;
+    }
+
+    return true;
+}
+
 bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
     CudaGpuMgmtThreadData *gpumgmt_thread = (CudaGpuMgmtThreadData *)user_data;
     CudaDeviceInfo *device = gpumgmt_thread->device;
@@ -1069,6 +1081,12 @@ bool cuda_kernel_prehasher(void *memory, int threads, Argon2Profile *profile, vo
 			profile->saltLen,
             threads);
 
+    bool success = cudaCheckError(device->error, device->errorMessage);
+    if(!success) {
+        gpumgmt_thread->unlock();
+        return false;
+    }
+
     return true;
 }
 
@@ -1098,7 +1116,13 @@ void *cuda_kernel_filler(int threads, Argon2Profile *profile, void *user_data) {
 			device->profileInfo.threads_per_chunk,
             gpumgmt_thread->threadsIdx);
 
-	return (void *)1;
+	bool success = cudaCheckError(device->error, device->errorMessage);
+	if(!success) {
+        gpumgmt_thread->unlock();
+        return NULL;
+    }
+
+ 	return  (void *)1;
 }
 
 bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, void *user_data) {
@@ -1113,6 +1137,11 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v
             device->arguments.outMemory[gpumgmt_thread->threadId],
             device->arguments.preseedMemory[gpumgmt_thread->threadId]);
 
+    if(!cudaCheckError(device->error, device->errorMessage)) {
+        gpumgmt_thread->unlock();
+        return false;
+    }
+
 	device->error = cudaMemcpyAsync(device->arguments.hostSeedMemory[gpumgmt_thread->threadId], device->arguments.hashMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4), cudaMemcpyDeviceToHost, stream);
 	if (device->error != cudaSuccess) {
 		device->errorMessage = "Error reading gpu memory.";
@@ -1120,13 +1149,16 @@ bool cuda_kernel_posthasher(void *memory, int threads, Argon2Profile *profile, v
 		return false;
 	}
 
-	while(cudaStreamQuery(stream) != cudaSuccess) {
-		this_thread::sleep_for(chrono::milliseconds(10));
-		continue;
-	}
+    cudaStreamSynchronize(stream);
+
+    bool success = cudaCheckError(device->error, device->errorMessage);
+    if(!success) {
+        gpumgmt_thread->unlock();
+        return false;
+    }
 
     memcpy(memory, device->arguments.hostSeedMemory[gpumgmt_thread->threadId], threads * (xmrig::ARGON2_HASHLEN + 4));
 	gpumgmt_thread->unlock();
 
-	return memory;
+	return true;
 }
\ No newline at end of file

From 329997a40aa6119dd5a41bd2b58ca9a5f8fccfe3 Mon Sep 17 00:00:00 2001
From: Haifa Bogdan Adnan <bogdan_adnan@yahoo.com>
Date: Fri, 6 Sep 2019 15:07:36 +0300
Subject: [PATCH 2/5] Fix for GTX 20XX series giving rejected shares.

---
 src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu
index db94e488..47d2c3d4 100644
--- a/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu
+++ b/src/crypto/argon2_hasher/hash/gpu/cuda/blake2b.cu
@@ -288,6 +288,8 @@ __device__ __forceinline__ void blake2b_final(uint32_t *out, int out_len, uint64
 
     blake2b_compress(h, (uint64_t*)buf, 0xFFFFFFFFFFFFFFFF, thr_id);
 
+    __syncthreads();
+
     uint32_t *cursor_in = (uint32_t *)h;
     cursor_out = out;
 

From c864c27f1377369280d74b8f555e81589139c158 Mon Sep 17 00:00:00 2001
From: Haifa Bogdan Adnan <bogdan_adnan@yahoo.com>
Date: Fri, 6 Sep 2019 15:22:53 +0300
Subject: [PATCH 3/5] Fix for cpu optimization not being properly saved to
 config file.

---
 src/core/Config.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/core/Config.cpp b/src/core/Config.cpp
index 3b371e04..8d292ae9 100644
--- a/src/core/Config.cpp
+++ b/src/core/Config.cpp
@@ -86,7 +86,7 @@ void xmrig::Config::getJSON(rapidjson::Document &doc) const
     if(cpuOptimization().isNull() || cpuOptimization().isEmpty())
         doc.AddMember("cpu-optimization", kNullType, allocator);
     else
-        doc.AddMember("cpu-optimization", StringRef(cpuOptimization().data()), allocator);
+        doc.AddMember("cpu-optimization", cpuOptimization().toJSON(doc), allocator);
 
     if (cpuAffinity() != -1L) {
         snprintf(affinity_tmp, sizeof(affinity_tmp) - 1, "0x%" PRIX64, cpuAffinity());

From d8daeda7bab3dcea754283f821431ef3d2811b94 Mon Sep 17 00:00:00 2001
From: Haifa Bogdan Adnan <bogdan_adnan@yahoo.com>
Date: Fri, 6 Sep 2019 16:28:55 +0300
Subject: [PATCH 4/5] Added support for CUDA compute capability 3.0. Multiple
 CUDA architecture compilation support.

---
 CMakeLists.txt                                |   8 +-
 .../hash/gpu/cuda/cuda_kernel.cu              | 140 +++++++++++-------
 2 files changed, 96 insertions(+), 52 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index ed6597d0..acb62108 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -394,7 +394,13 @@ if(WITH_CUDA)
     set(
             CUDA_NVCC_FLAGS
             ${CUDA_NVCC_FLAGS};
-            -O3 -arch=compute_35 -std=c++11
+            -O3 -std=c++11 --ptxas-options=-v
+            -gencode=arch=compute_75,code="sm_75,compute_75"
+            -gencode=arch=compute_61,code="sm_61,compute_61"
+            -gencode=arch=compute_52,code="sm_52,compute_52"
+            -gencode=arch=compute_50,code="sm_50,compute_50"
+            -gencode=arch=compute_35,code="sm_35,compute_35"
+            -gencode=arch=compute_30,code="sm_30,compute_30"
     )
     cuda_add_library(cuda_hasher MODULE ${SOURCE_CUDA_HASHER})
     set_target_properties(cuda_hasher
diff --git a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
index 92530e39..b9bad2b0 100644
--- a/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
+++ b/src/crypto/argon2_hasher/hash/gpu/cuda/cuda_kernel.cu
@@ -19,57 +19,95 @@
 
 #include "blake2b.cu"
 
-#define COMPUTE	\
-	asm ("{"	\
-		".reg .u32 s1, s2, s3, s4;\n\t"	\
-		"mul.lo.u32 s3, %0, %2;\n\t"	\
-		"mul.hi.u32 s4, %0, %2;\n\t"	\
-		"add.cc.u32 s3, s3, s3;\n\t"	\
-		"addc.u32 s4, s4, s4;\n\t"	\
-		"add.cc.u32 s1, %0, %2;\n\t"	\
-		"addc.u32 s2, %1, %3;\n\t"	\
-		"add.cc.u32 %0, s1, s3;\n\t"	\
-		"addc.u32 %1, s2, s4;\n\t"	\
-		"xor.b32 s1, %0, %6;\n\t"	\
-		"xor.b32 %6, %1, %7;\n\t"	\
-		"mov.b32 %7, s1;\n\t"	\
-		"mul.lo.u32 s3, %4, %6;\n\t"	\
-		"mul.hi.u32 s4, %4, %6;\n\t"	\
-		"add.cc.u32 s3, s3, s3;\n\t"	\
-		"addc.u32 s4, s4, s4;\n\t"	\
-		"add.cc.u32 s1, %4, %6;\n\t"	\
-		"addc.u32 s2, %5, %7;\n\t"	\
-		"add.cc.u32 %4, s1, s3;\n\t"	\
-		"addc.u32 %5, s2, s4;\n\t"	\
-		"xor.b32 s3, %2, %4;\n\t"	\
-		"xor.b32 s4, %3, %5;\n\t"	\
-		"shf.r.wrap.b32 %3, s4, s3, 24;\n\t"	\
-		"shf.r.wrap.b32 %2, s3, s4, 24;\n\t"	\
-		"mul.lo.u32 s3, %0, %2;\n\t"	\
-		"mul.hi.u32 s4, %0, %2;\n\t"	\
-		"add.cc.u32 s3, s3, s3;\n\t"	\
-		"addc.u32 s4, s4, s4;\n\t"	\
-		"add.cc.u32 s1, %0, %2;\n\t"	\
-		"addc.u32 s2, %1, %3;\n\t"	\
-		"add.cc.u32 %0, s1, s3;\n\t"	\
-		"addc.u32 %1, s2, s4;\n\t"	\
-		"xor.b32 s3, %0, %6;\n\t"	\
-		"xor.b32 s4, %1, %7;\n\t"	\
-		"shf.r.wrap.b32 %7, s4, s3, 16;\n\t"	\
-		"shf.r.wrap.b32 %6, s3, s4, 16;\n\t"	\
-		"mul.lo.u32 s3, %4, %6;\n\t"	\
-		"mul.hi.u32 s4, %4, %6;\n\t"	\
-		"add.cc.u32 s3, s3, s3;\n\t"	\
-		"addc.u32 s4, s4, s4;\n\t"	\
-		"add.cc.u32 s1, %4, %6;\n\t"	\
-		"addc.u32 s2, %5, %7;\n\t"	\
-		"add.cc.u32 %4, s1, s3;\n\t"	\
-		"addc.u32 %5, s2, s4;\n\t"	\
-		"xor.b32 s3, %2, %4;\n\t"	\
-		"xor.b32 s4, %3, %5;\n\t"	\
-		"shf.r.wrap.b32 %3, s3, s4, 31;\n\t"	\
-		"shf.r.wrap.b32 %2, s4, s3, 31;\n\t"	\
-	"}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w));
+#ifndef __CUDA_ARCH__
+#define __CUDA_ARCH__ 0
+#endif
+
+#if (__CUDA_ARCH__ >= 350)
+    #define COMPUTE	\
+        asm ("{"	\
+            ".reg .u32 s1, s2, s3, s4;\n\t"	\
+            "mul.lo.u32 s3, %0, %2;\n\t"	\
+            "mul.hi.u32 s4, %0, %2;\n\t"	\
+            "add.cc.u32 s3, s3, s3;\n\t"	\
+            "addc.u32 s4, s4, s4;\n\t"	\
+            "add.cc.u32 s1, %0, %2;\n\t"	\
+            "addc.u32 s2, %1, %3;\n\t"	\
+            "add.cc.u32 %0, s1, s3;\n\t"	\
+            "addc.u32 %1, s2, s4;\n\t"	\
+            "xor.b32 s1, %0, %6;\n\t"	\
+            "xor.b32 %6, %1, %7;\n\t"	\
+            "mov.b32 %7, s1;\n\t"	\
+            "mul.lo.u32 s3, %4, %6;\n\t"	\
+            "mul.hi.u32 s4, %4, %6;\n\t"	\
+            "add.cc.u32 s3, s3, s3;\n\t"	\
+            "addc.u32 s4, s4, s4;\n\t"	\
+            "add.cc.u32 s1, %4, %6;\n\t"	\
+            "addc.u32 s2, %5, %7;\n\t"	\
+            "add.cc.u32 %4, s1, s3;\n\t"	\
+            "addc.u32 %5, s2, s4;\n\t"	\
+            "xor.b32 s3, %2, %4;\n\t"	\
+            "xor.b32 s4, %3, %5;\n\t"	\
+            "shf.r.wrap.b32 %3, s4, s3, 24;\n\t"	\
+            "shf.r.wrap.b32 %2, s3, s4, 24;\n\t"	\
+            "mul.lo.u32 s3, %0, %2;\n\t"	\
+            "mul.hi.u32 s4, %0, %2;\n\t"	\
+            "add.cc.u32 s3, s3, s3;\n\t"	\
+            "addc.u32 s4, s4, s4;\n\t"	\
+            "add.cc.u32 s1, %0, %2;\n\t"	\
+            "addc.u32 s2, %1, %3;\n\t"	\
+            "add.cc.u32 %0, s1, s3;\n\t"	\
+            "addc.u32 %1, s2, s4;\n\t"	\
+            "xor.b32 s3, %0, %6;\n\t"	\
+            "xor.b32 s4, %1, %7;\n\t"	\
+            "shf.r.wrap.b32 %7, s4, s3, 16;\n\t"	\
+            "shf.r.wrap.b32 %6, s3, s4, 16;\n\t"	\
+            "mul.lo.u32 s3, %4, %6;\n\t"	\
+            "mul.hi.u32 s4, %4, %6;\n\t"	\
+            "add.cc.u32 s3, s3, s3;\n\t"	\
+            "addc.u32 s4, s4, s4;\n\t"	\
+            "add.cc.u32 s1, %4, %6;\n\t"	\
+            "addc.u32 s2, %5, %7;\n\t"	\
+            "add.cc.u32 %4, s1, s3;\n\t"	\
+            "addc.u32 %5, s2, s4;\n\t"	\
+            "xor.b32 s3, %2, %4;\n\t"	\
+            "xor.b32 s4, %3, %5;\n\t"	\
+            "shf.r.wrap.b32 %3, s3, s4, 31;\n\t"	\
+            "shf.r.wrap.b32 %2, s4, s3, 31;\n\t"	\
+        "}" : "+r"(tmp_a.x), "+r"(tmp_a.y), "+r"(tmp_a.z), "+r"(tmp_a.w), "+r"(tmp_b.x), "+r"(tmp_b.y), "+r"(tmp_b.z), "+r"(tmp_b.w));
+#else
+    #define downsample(x, lo, hi) \
+    { \
+        lo = (uint32_t)x; \
+        hi = (uint32_t)(x >> 32); \
+    }
+
+    #define upsample(lo, hi) (((uint64_t)(hi) << 32) | (uint64_t)(lo))
+
+    #define rotate(x, n) (((x) >> (64-n)) | ((x) << n))
+
+    #define fBlaMka(x, y) ((x) + (y) + 2 * upsample((uint32_t)(x) * (uint32_t)y, __umulhi((uint32_t)(x), (uint32_t)(y))))
+
+    #define COMPUTE \
+    { \
+        uint64_t a64 = upsample(tmp_a.x, tmp_a.y); \
+        uint64_t b64 = upsample(tmp_a.z, tmp_a.w); \
+        uint64_t c64 = upsample(tmp_b.x, tmp_b.y); \
+        uint64_t d64 = upsample(tmp_b.z, tmp_b.w); \
+        a64 = fBlaMka(a64, b64);          \
+        d64 = rotate(d64 ^ a64, 32);      \
+        c64 = fBlaMka(c64, d64);          \
+        b64 = rotate(b64 ^ c64, 40);      \
+        a64 = fBlaMka(a64, b64);          \
+        d64 = rotate(d64 ^ a64, 48);      \
+        c64 = fBlaMka(c64, d64);          \
+        b64 = rotate(b64 ^ c64, 1);       \
+        downsample(a64, tmp_a.x, tmp_a.y); \
+        downsample(b64, tmp_a.z, tmp_a.w); \
+        downsample(c64, tmp_b.x, tmp_b.y); \
+        downsample(d64, tmp_b.z, tmp_b.w); \
+    }
+#endif // __CUDA_ARCH__
 
 #define G1(data)           \
 {                           \

From 3f389bbf1b064edf5a4247cdeb9c55ee6db3fcde Mon Sep 17 00:00:00 2001
From: Haifa Bogdan Adnan <bogdanadnan@users.noreply.github.com>
Date: Fri, 6 Sep 2019 20:46:54 +0300
Subject: [PATCH 5/5] Update version.h

---
 src/version.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/version.h b/src/version.h
index 5f183498..68022e07 100644
--- a/src/version.h
+++ b/src/version.h
@@ -28,7 +28,7 @@
 #define APP_ID        "ninjarig"
 #define APP_NAME      "NinjaRig"
 #define APP_DESC      "NinjaRig CPU/GPU miner"
-#define APP_VERSION   "1.0.1"
+#define APP_VERSION   "1.0.2"
 //#define APP_DOMAIN    "xmrig.com"
 //#define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2019 Haifa Bogdan Adnan"
@@ -36,7 +36,7 @@
 
 #define APP_VER_MAJOR  1
 #define APP_VER_MINOR  0
-#define APP_VER_PATCH  1
+#define APP_VER_PATCH  2
 
 #ifdef _MSC_VER
 #   if (_MSC_VER >= 1920)