From 8fe0577d60d10693a15e13766e5b2e9454011bb6 Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 23 Sep 2020 08:06:28 +0700 Subject: [PATCH 01/14] v6.3.5-dev --- src/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.h b/src/version.h index 47e95d2d..01108a16 100644 --- a/src/version.h +++ b/src/version.h @@ -28,7 +28,7 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig miner" -#define APP_VERSION "6.3.4" +#define APP_VERSION "6.3.5-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2020 xmrig.com" @@ -36,7 +36,7 @@ #define APP_VER_MAJOR 6 #define APP_VER_MINOR 3 -#define APP_VER_PATCH 4 +#define APP_VER_PATCH 5 #ifdef _MSC_VER # if (_MSC_VER >= 1920) From 1e26e586602ab0056233e23b5644c41fd86d6760 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Wed, 23 Sep 2020 11:44:08 +0200 Subject: [PATCH 02/14] Fix for ARM compilation --- src/crypto/randomx/randomx.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index 2804b1b7..c20859ed 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -337,12 +337,16 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx INST_HANDLE(FDIV_M, FMUL_R); INST_HANDLE(FSQRT_R, FDIV_M); +#if defined(_M_X64) || defined(__x86_64__) if (xmrig::Cpu::info()->jccErratum()) { INST_HANDLE2(CBRANCH, CBRANCH, FSQRT_R); } else { INST_HANDLE2(CBRANCH, CBRANCH, FSQRT_R); } +#else + INST_HANDLE(CBRANCH, FSQRT_R); +#endif #if defined(_M_X64) || defined(__x86_64__) if (xmrig::Cpu::info()->hasBMI2()) { From bef9031b035eb384c3bade030fa76563f1edf644 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Fri, 25 Sep 2020 10:53:24 +0200 Subject: [PATCH 03/14] KawPow: fixed OpenCL memory leak --- .../opencl/runners/OclKawPowRunner.cpp | 5 +-- src/backend/opencl/runners/OclKawPowRunner.h | 1 - 
.../opencl/runners/tools/OclKawPow.cpp | 45 +++++++++++-------- src/backend/opencl/runners/tools/OclKawPow.h | 4 +- 4 files changed, 30 insertions(+), 25 deletions(-) diff --git a/src/backend/opencl/runners/OclKawPowRunner.cpp b/src/backend/opencl/runners/OclKawPowRunner.cpp index 0ec466f5..b9ba17fc 100644 --- a/src/backend/opencl/runners/OclKawPowRunner.cpp +++ b/src/backend/opencl/runners/OclKawPowRunner.cpp @@ -69,8 +69,6 @@ OclKawPowRunner::~OclKawPowRunner() delete m_calculateDagKernel; - OclLib::release(m_searchKernel); - OclLib::release(m_controlQueue); OclLib::release(m_stop); @@ -120,8 +118,7 @@ void OclKawPowRunner::run(uint32_t nonce, uint32_t *hashOutput) void OclKawPowRunner::set(const Job &job, uint8_t *blob) { m_blockHeight = static_cast(job.height()); - m_searchProgram = OclKawPow::get(*this, m_blockHeight, m_workGroupSize); - m_searchKernel = OclLib::createKernel(m_searchProgram, "progpow_search"); + m_searchKernel = OclKawPow::get(*this, m_blockHeight, m_workGroupSize); const uint32_t epoch = m_blockHeight / KPHash::EPOCH_LENGTH; diff --git a/src/backend/opencl/runners/OclKawPowRunner.h b/src/backend/opencl/runners/OclKawPowRunner.h index a4ca8015..a88414e5 100644 --- a/src/backend/opencl/runners/OclKawPowRunner.h +++ b/src/backend/opencl/runners/OclKawPowRunner.h @@ -69,7 +69,6 @@ private: KawPow_CalculateDAGKernel* m_calculateDagKernel = nullptr; - cl_program m_searchProgram = nullptr; cl_kernel m_searchKernel = nullptr; size_t m_workGroupSize = 256; diff --git a/src/backend/opencl/runners/tools/OclKawPow.cpp b/src/backend/opencl/runners/tools/OclKawPow.cpp index f2f97191..4b8274f4 100644 --- a/src/backend/opencl/runners/tools/OclKawPow.cpp +++ b/src/backend/opencl/runners/tools/OclKawPow.cpp @@ -54,8 +54,9 @@ namespace xmrig { class KawPowCacheEntry { public: - inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program) : + inline KawPowCacheEntry(const Algorithm &algo, uint64_t period, 
uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel) : program(program), + kernel(kernel), m_algo(algo), m_index(index), m_period(period), @@ -65,9 +66,10 @@ public: inline bool isExpired(uint64_t period) const { return m_period + 1 < period; } inline bool match(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) const { return m_algo == algo && m_period == period && m_worksize == worksize && m_index == index; } inline bool match(const IOclRunner &runner, uint64_t period, uint32_t worksize) const { return match(runner.algorithm(), period, worksize, runner.deviceIndex()); } - inline void release() { OclLib::release(program); } + inline void release() { OclLib::release(kernel); OclLib::release(program); } cl_program program; + cl_kernel kernel; private: Algorithm m_algo; @@ -82,16 +84,16 @@ class KawPowCache public: KawPowCache() = default; - inline cl_program search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); } + inline cl_kernel search(const IOclRunner &runner, uint64_t period, uint32_t worksize) { return search(runner.algorithm(), period, worksize, runner.deviceIndex()); } - inline cl_program search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) + inline cl_kernel search(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index) { std::lock_guard lock(m_mutex); for (const auto &entry : m_data) { if (entry.match(algo, period, worksize, index)) { - return entry.program; + return entry.kernel; } } @@ -99,9 +101,10 @@ public: } - void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program) + void add(const Algorithm &algo, uint64_t period, uint32_t worksize, uint32_t index, cl_program program, cl_kernel kernel) { if (search(algo, period, worksize, index)) { + OclLib::release(kernel); OclLib::release(program); return; } @@ -109,7 +112,7 @@ public: 
std::lock_guard lock(m_mutex); gc(period); - m_data.emplace_back(algo, period, worksize, index, program); + m_data.emplace_back(algo, period, worksize, index, program, kernel); } @@ -159,15 +162,15 @@ static KawPowCache cache; class KawPowBuilder { public: - cl_program build(const IOclRunner &runner, uint64_t period, uint32_t worksize) + cl_kernel build(const IOclRunner &runner, uint64_t period, uint32_t worksize) { std::lock_guard lock(m_mutex); const uint64_t ts = Chrono::steadyMSecs(); - cl_program program = cache.search(runner, period, worksize); - if (program) { - return program; + cl_kernel kernel = cache.search(runner, period, worksize); + if (kernel) { + return kernel; } cl_int ret; @@ -175,7 +178,7 @@ public: cl_device_id device = runner.data().device.id(); const char *s = source.c_str(); - program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret); + cl_program program = OclLib::createProgramWithSource(runner.ctx(), 1, &s, nullptr, &ret); if (ret != CL_SUCCESS) { return nullptr; } @@ -199,11 +202,17 @@ public: return nullptr; } + kernel = OclLib::createKernel(program, "progpow_search", &ret); + if (ret != CL_SUCCESS) { + OclLib::release(program); + return nullptr; + } + LOG_INFO("%s " YELLOW("KawPow") " program for period " WHITE_BOLD("%" PRIu64) " compiled " BLACK_BOLD("(%" PRIu64 "ms)"), Tags::opencl(), period, Chrono::steadyMSecs() - ts); - cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program); + cache.add(runner.algorithm(), period, worksize, runner.deviceIndex(), program, kernel); - return program; + return kernel; } @@ -382,7 +391,7 @@ public: static KawPowBuilder builder; -cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize) +cl_kernel OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t worksize) { const uint64_t period = height / KPHash::PERIOD_LENGTH; @@ -396,9 +405,9 @@ cl_program OclKawPow::get(const IOclRunner &runner, uint64_t height, uint32_t wo 
[](uv_work_t *req, int) { delete static_cast(req->data); } ); - cl_program program = cache.search(runner, period, worksize); - if (program) { - return program; + cl_kernel kernel = cache.search(runner, period, worksize); + if (kernel) { + return kernel; } return builder.build(runner, period, worksize); diff --git a/src/backend/opencl/runners/tools/OclKawPow.h b/src/backend/opencl/runners/tools/OclKawPow.h index 9e07d70c..8d072680 100644 --- a/src/backend/opencl/runners/tools/OclKawPow.h +++ b/src/backend/opencl/runners/tools/OclKawPow.h @@ -30,7 +30,7 @@ #include -using cl_program = struct _cl_program *; +using cl_kernel = struct _cl_kernel *; namespace xmrig { @@ -42,7 +42,7 @@ class IOclRunner; class OclKawPow { public: - static cl_program get(const IOclRunner &runner, uint64_t height, uint32_t worksize); + static cl_kernel get(const IOclRunner &runner, uint64_t height, uint32_t worksize); static void clear(); }; From 9be3b69109cb335a607e24a7c4ebc5d5c36d91a1 Mon Sep 17 00:00:00 2001 From: cohcho Date: Fri, 25 Sep 2020 14:36:06 +0000 Subject: [PATCH 04/14] soft_aes: fix previous optimization the best order of hash/fill/prefetch depends on hw/soft AES only hw AES is faster after previous optimization --- src/crypto/randomx/aes_hash.cpp | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index a15f75ad..274a9d4c 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -260,14 +260,26 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 2, fill_state2); \ rx_store_vec_i128((rx_vec_i128*)scratchpadPtr + k * 4 + 3, fill_state3); - HASH_STATE(0); - HASH_STATE(1); + switch(softAes) { + case 0: + HASH_STATE(0); + HASH_STATE(1); - FILL_STATE(0); - FILL_STATE(1); + FILL_STATE(0); + FILL_STATE(1); - rx_prefetch_t0(prefetchPtr); - rx_prefetch_t0(prefetchPtr 
+ 64); + rx_prefetch_t0(prefetchPtr); + rx_prefetch_t0(prefetchPtr + 64); + break; + default: + HASH_STATE(0); + FILL_STATE(0); + rx_prefetch_t0(prefetchPtr); + + HASH_STATE(1); + FILL_STATE(1); + rx_prefetch_t0(prefetchPtr + 64); + } scratchpadPtr += 128; prefetchPtr += 128; From 189cc78d44c77f9f362895228592167541725130 Mon Sep 17 00:00:00 2001 From: cohcho Date: Fri, 25 Sep 2020 17:25:23 +0000 Subject: [PATCH 05/14] Miner: filter invalid algos --- src/base/crypto/Algorithm.h | 2 +- src/core/Miner.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/base/crypto/Algorithm.h b/src/base/crypto/Algorithm.h index b04765d5..186dd77e 100644 --- a/src/base/crypto/Algorithm.h +++ b/src/base/crypto/Algorithm.h @@ -96,7 +96,7 @@ public: inline bool isCN() const { auto f = family(); return f == CN || f == CN_LITE || f == CN_HEAVY || f == CN_PICO; } inline bool isEqual(const Algorithm &other) const { return m_id == other.m_id; } - inline bool isValid() const { return m_id != INVALID; } + inline bool isValid() const { return m_id != INVALID && family() != UNKNOWN; } inline const char *name() const { return name(false); } inline const char *shortName() const { return name(true); } inline Family family() const { return family(m_id); } diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index 12be05ec..f4edfa97 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -121,7 +121,7 @@ public: for (int i = 0; i < Algorithm::MAX; ++i) { const Algorithm algo(static_cast(i)); - if (isEnabled(algo)) { + if (algo.isValid() && isEnabled(algo)) { algorithms.push_back(algo); } } From f7d634894834e7c371fb8471cd04170eadf550b0 Mon Sep 17 00:00:00 2001 From: cohcho Date: Sat, 26 Sep 2020 16:41:15 +0000 Subject: [PATCH 06/14] String: distinguish nullptr/empty str --- src/base/tools/String.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/base/tools/String.cpp b/src/base/tools/String.cpp index f9322274..8187f467 100644 --- 
a/src/base/tools/String.cpp +++ b/src/base/tools/String.cpp @@ -33,7 +33,7 @@ xmrig::String::String(const char *str) : m_size(str == nullptr ? 0 : strlen(str)) { - if (m_size == 0) { + if (str == nullptr) { return; } From 0e9ed351a15ed6cebe80ff9f46190f0142d83583 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 27 Sep 2020 08:55:57 +0200 Subject: [PATCH 07/14] Fixed SSE4.1 for old CPUs Enable SSE4.1 only where it's needed. --- cmake/flags.cmake | 8 ++++---- cmake/randomx.cmake | 4 ++++ 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 4ff316e5..5edad339 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -29,8 +29,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES GNU) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -flax-vector-conversions") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") add_definitions(/DHAVE_ROTR) endif() @@ -87,8 +87,8 @@ elseif (CMAKE_CXX_COMPILER_ID MATCHES Clang) set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon -march=${CMAKE_SYSTEM_PROCESSOR}") else() - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -msse4.1") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes -msse4.1") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -maes") check_symbol_exists("_rotr" "x86intrin.h" HAVE_ROTR) if (HAVE_ROTR) diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index 0d3f7bab..c86aa171 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -64,6 +64,10 @@ if (WITH_RANDOMX) set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C) endif() + if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + 
set_source_files_properties(src/crypto/randomx/blake2/blake2b.c PROPERTIES COMPILE_FLAGS -msse4.1) + endif() + if (CMAKE_CXX_COMPILER_ID MATCHES Clang) set_source_files_properties(src/crypto/randomx/jit_compiler_x86.cpp PROPERTIES COMPILE_FLAGS -Wno-unused-const-variable) endif() From 84f8a0dc54939a158c992944142026cc7a179c4e Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 27 Sep 2020 11:46:32 +0200 Subject: [PATCH 08/14] RandomX: isolate SSE4.1 code to fix crashes on old CPUs --- cmake/randomx.cmake | 3 +- src/crypto/randomx/blake2/blake2b.c | 69 +------------ src/crypto/randomx/blake2/blake2b_sse41.c | 112 ++++++++++++++++++++++ 3 files changed, 115 insertions(+), 69 deletions(-) create mode 100644 src/crypto/randomx/blake2/blake2b_sse41.c diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index c86aa171..19d8f481 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -19,6 +19,7 @@ if (WITH_RANDOMX) src/crypto/randomx/allocator.cpp src/crypto/randomx/blake2_generator.cpp src/crypto/randomx/blake2/blake2b.c + src/crypto/randomx/blake2/blake2b_sse41.c src/crypto/randomx/bytecode_machine.cpp src/crypto/randomx/dataset.cpp src/crypto/randomx/instructions_portable.cpp @@ -65,7 +66,7 @@ if (WITH_RANDOMX) endif() if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) - set_source_files_properties(src/crypto/randomx/blake2/blake2b.c PROPERTIES COMPILE_FLAGS -msse4.1) + set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1) endif() if (CMAKE_CXX_COMPILER_ID MATCHES Clang) diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 7a1b9dae..6f7a73e9 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -39,40 +39,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/blake2/blake2-impl.h" -#if defined(_M_X64) || defined(__x86_64__) - -#ifdef _MSC_VER -#include -#endif - -#include -#include "blake2b-round.h" - -#endif - static const uint64_t blake2b_IV[8] = { UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; -#if defined(_M_X64) || defined(__x86_64__) -static const uint8_t blake2b_sigma_sse41[12][16] = { - {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, - {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, - {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, - {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, - {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, - {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, - {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, - {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, - {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, - {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, - {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, - {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, -}; -#endif - static const uint8_t blake2b_sigma[12][16] = { {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}, {14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3}, @@ -207,46 +179,6 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t return 0; } -#if defined(_M_X64) || defined(__x86_64__) -static void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) -{ - __m128i row1l, row1h; - __m128i row2l, row2h; - __m128i row3l, row3h; - __m128i row4l, row4h; - __m128i b0, b1; - __m128i t0, t1; - - const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); - const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 
12, 13, 14, 15, 8, 9, 10); - - row1l = LOADU(&S->h[0]); - row1h = LOADU(&S->h[2]); - row2l = LOADU(&S->h[4]); - row2h = LOADU(&S->h[6]); - row3l = LOADU(&blake2b_IV[0]); - row3h = LOADU(&blake2b_IV[2]); - row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); - row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); - - const uint64_t* m = (const uint64_t*)(block); - - for (uint32_t r = 0; r < 12; ++r) { - ROUND(r); - } - - row1l = _mm_xor_si128(row3l, row1l); - row1h = _mm_xor_si128(row3h, row1h); - STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); - STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); - row2l = _mm_xor_si128(row4l, row2l); - row2h = _mm_xor_si128(row4h, row2h); - STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); - STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); -} -#undef ROUND -#endif - static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) { uint64_t m[16]; uint64_t v[16]; @@ -308,6 +240,7 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) #if defined(_M_X64) || defined(__x86_64__) uint32_t rx_blake2b_use_sse41 = 0; +void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block); #define rx_blake2b_compress(S, block) \ if (rx_blake2b_use_sse41) \ diff --git a/src/crypto/randomx/blake2/blake2b_sse41.c b/src/crypto/randomx/blake2/blake2b_sse41.c new file mode 100644 index 00000000..684ae0fe --- /dev/null +++ b/src/crypto/randomx/blake2/blake2b_sse41.c @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2018-2019, tevador + * Copyright 2018-2020 SChernykh + * Copyright 2016-2020 XMRig , + +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of the copyright holder nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +/* Original code from Argon2 reference source code package used under CC0 Licence + * https://github.com/P-H-C/phc-winner-argon2 + * Copyright 2015 + * Daniel Dinu, Dmitry Khovratovich, Jean-Philippe Aumasson, and Samuel Neves +*/ + +#if defined(_M_X64) || defined(__x86_64__) + +#include +#include +#include + +#include "crypto/randomx/blake2/blake2.h" + +#ifdef _MSC_VER +#include +#endif + +#include +#include "blake2b-round.h" + + +static const uint64_t blake2b_IV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; + + +static const uint8_t blake2b_sigma_sse41[12][16] = { + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, + {11, 12, 5, 15, 8, 0, 2, 13, 10, 3, 7, 9, 14, 6, 1, 4}, + {7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, + {9, 5, 2, 10, 0, 7, 4, 15, 14, 11, 6, 3, 1, 12, 8, 13}, + {2, 6, 0, 8, 12, 10, 11, 3, 4, 7, 15, 1, 13, 5, 14, 9}, + {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, + {13, 7, 12, 3, 11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, + {6, 14, 11, 0, 15, 9, 3, 8, 12, 13, 1, 10, 2, 7, 4, 5}, + {10, 8, 7, 1, 2, 4, 6, 5, 15, 9, 3, 13, 11, 14, 12, 0}, + {0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, + {14, 4, 9, 13, 10, 8, 15, 6, 1, 0, 11, 5, 12, 2, 7, 3}, +}; + + +void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t *block) +{ + __m128i row1l, row1h; + __m128i row2l, row2h; + __m128i row3l, row3h; + __m128i row4l, row4h; + __m128i b0, b1; + __m128i t0, t1; + + const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9); + const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10); + + row1l = LOADU(&S->h[0]); + row1h = LOADU(&S->h[2]); + row2l = LOADU(&S->h[4]); + row2h = LOADU(&S->h[6]); + 
+ row3l = LOADU(&blake2b_IV[0]); + row3h = LOADU(&blake2b_IV[2]); + row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0])); + row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0])); + + const uint64_t* m = (const uint64_t*)(block); + + for (uint32_t r = 0; r < 12; ++r) { + ROUND(r); + } + + row1l = _mm_xor_si128(row3l, row1l); + row1h = _mm_xor_si128(row3h, row1h); + STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l)); + STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h)); + row2l = _mm_xor_si128(row4l, row2l); + row2h = _mm_xor_si128(row4h, row2h); + STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l)); + STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h)); +} +#endif From 3025c265e8f82b3da0afbf5753e08fe995cfe6ba Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 27 Sep 2020 11:50:08 +0200 Subject: [PATCH 09/14] RandomX: removed duplicate constants in Blake2b --- src/crypto/randomx/blake2/blake2b.c | 2 +- src/crypto/randomx/blake2/blake2b_sse41.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 6f7a73e9..b38f8b90 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -39,7 +39,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "crypto/randomx/blake2/blake2.h" #include "crypto/randomx/blake2/blake2-impl.h" -static const uint64_t blake2b_IV[8] = { +const uint64_t blake2b_IV[8] = { UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), diff --git a/src/crypto/randomx/blake2/blake2b_sse41.c b/src/crypto/randomx/blake2/blake2b_sse41.c index 684ae0fe..c8f88735 100644 --- a/src/crypto/randomx/blake2/blake2b_sse41.c +++ b/src/crypto/randomx/blake2/blake2b_sse41.c @@ -50,11 +50,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "blake2b-round.h" -static const uint64_t blake2b_IV[8] = { - UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), - UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), - UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), - UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179) }; +extern const uint64_t blake2b_IV[8]; static const uint8_t blake2b_sigma_sse41[12][16] = { From 7b4f7681140e940f8f4d1ac8980260c2c6189ced Mon Sep 17 00:00:00 2001 From: SChernykh Date: Tue, 29 Sep 2020 21:22:11 +0200 Subject: [PATCH 10/14] RandomX: optimized soft AES code Unrolled loop was 5-10% slower depending on CPU. --- src/crypto/randomx/aes_hash.cpp | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 274a9d4c..eed82960 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -270,19 +270,22 @@ void hashAndFillAes1Rx4(void *scratchpad, size_t scratchpadSize, void *hash, voi rx_prefetch_t0(prefetchPtr); rx_prefetch_t0(prefetchPtr + 64); + + scratchpadPtr += 128; + prefetchPtr += 128; + break; + default: HASH_STATE(0); FILL_STATE(0); rx_prefetch_t0(prefetchPtr); - HASH_STATE(1); - FILL_STATE(1); - rx_prefetch_t0(prefetchPtr + 64); - } + scratchpadPtr += 64; + prefetchPtr += 64; - scratchpadPtr += 128; - prefetchPtr += 128; + break; + } } prefetchPtr = (const char*) scratchpad; scratchpadEnd += PREFETCH_DISTANCE; From 5a7bcb2d0323c7cecf635dc97328a1f86c08f69e Mon Sep 17 00:00:00 2001 From: Dusan Klinec Date: Wed, 30 Sep 2020 19:58:10 +0200 Subject: [PATCH 11/14] fixes #1844, adds WITH_SSE cmake option now it is possible to disable sse for Blake2, which is not supported on ARMs --- CMakeLists.txt | 1 + cmake/randomx.cmake | 2 +- doc/build/CMAKE_OPTIONS.md | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5bacc969..53a35540 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt 
@@ -24,6 +24,7 @@ option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support ( option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON) option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF) option(WITH_PROFILING "Enable profiling for developers" OFF) +option(WITH_SSE "Enable SSE for Blake2" ON) option(BUILD_STATIC "Build static binary" OFF) option(ARM_TARGET "Force use specific ARM target 8 or 7" 0) diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index 19d8f481..edc21d8f 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -65,7 +65,7 @@ if (WITH_RANDOMX) set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C) endif() - if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + if (NOT ARM_TARGET AND WITH_SSE AND (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)) set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1) endif() diff --git a/doc/build/CMAKE_OPTIONS.md b/doc/build/CMAKE_OPTIONS.md index c3755b58..218c24c9 100644 --- a/doc/build/CMAKE_OPTIONS.md +++ b/doc/build/CMAKE_OPTIONS.md @@ -22,6 +22,7 @@ This feature add external dependency to libhwloc (1.10.0+) (except MSVC builds). * **`-DWITH_EMBEDDED_CONFIG=ON`** Enable [embedded](https://github.com/xmrig/xmrig/issues/957) config support. * **`-DWITH_OPENCL=OFF`** Disable OpenCL backend. * **`-DWITH_CUDA=OFF`** Disable CUDA backend. +* **`-DWITH_SSE=OFF`** Disable SSE for Blake2 (useful for arm builds). ## Debug options From d45bb24a3245503920612abcb9339fbc32ad2498 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 1 Oct 2020 11:00:08 +0700 Subject: [PATCH 12/14] Renamed WITH_SSE to WITH_SSE4_1 and make it work on all platforms. 
--- CMakeLists.txt | 2 +- cmake/cpu.cmake | 7 ++++++- cmake/randomx.cmake | 9 ++++++--- doc/build/CMAKE_OPTIONS.md | 2 +- src/crypto/randomx/blake2/blake2b.c | 2 +- src/crypto/rx/RxVm.cpp | 4 ++-- 6 files changed, 17 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 53a35540..f7219bc3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,7 +24,7 @@ option(WITH_ADL "Enable ADL (AMD Display Library) or sysfs support ( option(WITH_STRICT_CACHE "Enable strict checks for OpenCL cache" ON) option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF) option(WITH_PROFILING "Enable profiling for developers" OFF) -option(WITH_SSE "Enable SSE for Blake2" ON) +option(WITH_SSE4_1 "Enable SSE 4.1 for Blake2" ON) option(BUILD_STATIC "Build static binary" OFF) option(ARM_TARGET "Force use specific ARM target 8 or 7" 0) diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index 2fdebad8..c7d19b63 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -2,9 +2,10 @@ if (NOT CMAKE_SYSTEM_PROCESSOR) message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined") endif() - if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") add_definitions(/DRAPIDJSON_SSE2) +else() + set(WITH_SSE4_1 OFF) endif() if (NOT ARM_TARGET) @@ -41,3 +42,7 @@ if (ARM_TARGET AND ARM_TARGET GREATER 6) add_definitions(/DXMRIG_ARMv7) endif() endif() + +if (WITH_SSE4_1) + add_definitions(/DXMRIG_FEATURE_SSE4_1) +endif() diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index edc21d8f..a554991b 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -19,7 +19,6 @@ if (WITH_RANDOMX) src/crypto/randomx/allocator.cpp src/crypto/randomx/blake2_generator.cpp src/crypto/randomx/blake2/blake2b.c - src/crypto/randomx/blake2/blake2b_sse41.c src/crypto/randomx/bytecode_machine.cpp src/crypto/randomx/dataset.cpp src/crypto/randomx/instructions_portable.cpp @@ -65,8 +64,12 @@ if (WITH_RANDOMX) set_property(SOURCE src/crypto/randomx/jit_compiler_a64_static.S PROPERTY LANGUAGE C) 
endif() - if (NOT ARM_TARGET AND WITH_SSE AND (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang)) - set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1) + if (WITH_SSE4_1) + list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/blake2b_sse41.c) + + if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1) + endif() endif() if (CMAKE_CXX_COMPILER_ID MATCHES Clang) diff --git a/doc/build/CMAKE_OPTIONS.md b/doc/build/CMAKE_OPTIONS.md index 218c24c9..81e3914f 100644 --- a/doc/build/CMAKE_OPTIONS.md +++ b/doc/build/CMAKE_OPTIONS.md @@ -22,7 +22,7 @@ This feature add external dependency to libhwloc (1.10.0+) (except MSVC builds). * **`-DWITH_EMBEDDED_CONFIG=ON`** Enable [embedded](https://github.com/xmrig/xmrig/issues/957) config support. * **`-DWITH_OPENCL=OFF`** Disable OpenCL backend. * **`-DWITH_CUDA=OFF`** Disable CUDA backend. -* **`-DWITH_SSE=OFF`** Disable SSE for Blake2 (useful for arm builds). +* **`-DWITH_SSE4_1=OFF`** Disable SSE 4.1 for Blake2 (useful for arm builds). 
## Debug options diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index b38f8b90..49329e46 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -237,7 +237,7 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) #undef ROUND } -#if defined(_M_X64) || defined(__x86_64__) +#if defined(XMRIG_FEATURE_SSE4_1) uint32_t rx_blake2b_use_sse41 = 0; void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block); diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp index 8879eef3..16abf11a 100644 --- a/src/crypto/rx/RxVm.cpp +++ b/src/crypto/rx/RxVm.cpp @@ -31,7 +31,7 @@ #include "crypto/rx/RxVm.h" -#if defined(_M_X64) || defined(__x86_64__) +#if defined(XMRIG_FEATURE_SSE4_1) extern "C" uint32_t rx_blake2b_use_sse41; #endif @@ -60,7 +60,7 @@ randomx_vm* xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so flags |= RANDOMX_FLAG_AMD; } -# if defined(_M_X64) || defined(__x86_64__) +# if defined(XMRIG_FEATURE_SSE4_1) rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0; # endif From 1b4a124bc5b03c4c0d5755e6f7b71991cf7142cb Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 1 Oct 2020 17:46:05 +0700 Subject: [PATCH 13/14] Fix x86 build. 
--- cmake/cpu.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index c7d19b63..8843133b 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -2,7 +2,7 @@ if (NOT CMAKE_SYSTEM_PROCESSOR) message(WARNING "CMAKE_SYSTEM_PROCESSOR not defined") endif() -if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") +if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$" AND CMAKE_SIZEOF_VOID_P EQUAL 8) add_definitions(/DRAPIDJSON_SSE2) else() set(WITH_SSE4_1 OFF) From a89c2c8dd1391829cd74152c688c388f39f473a2 Mon Sep 17 00:00:00 2001 From: xmrig Date: Fri, 2 Oct 2020 22:39:26 +0700 Subject: [PATCH 14/14] Update CHANGELOG.md --- CHANGELOG.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 887ba895..bf18ade1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,10 @@ +# v6.3.5 +- [#1845](https://github.com/xmrig/xmrig/pull/1845) [#1861](https://github.com/xmrig/xmrig/pull/1861) Fixed ARM build and added CMake option `WITH_SSE4_1`. +- [#1846](https://github.com/xmrig/xmrig/pull/1846) KawPow: fixed OpenCL memory leak. +- [#1849](https://github.com/xmrig/xmrig/pull/1849) [#1859](https://github.com/xmrig/xmrig/pull/1859) RandomX: optimized soft AES code. +- [#1850](https://github.com/xmrig/xmrig/pull/1850) [#1852](https://github.com/xmrig/xmrig/pull/1852) General code improvements. +- [#1853](https://github.com/xmrig/xmrig/issues/1853) [#1856](https://github.com/xmrig/xmrig/pull/1856) [#1857](https://github.com/xmrig/xmrig/pull/1857) Fixed crash on old CPUs. + # v6.3.4 - [#1823](https://github.com/xmrig/xmrig/pull/1823) RandomX: added new option `scratchpad_prefetch_mode`. - [#1827](https://github.com/xmrig/xmrig/pull/1827) [#1831](https://github.com/xmrig/xmrig/pull/1831) Improved nonce iteration performance.