diff --git a/src/base/net/Pool.cpp b/src/base/net/Pool.cpp index 8a4d0a0a..a1fd4331 100644 --- a/src/base/net/Pool.cpp +++ b/src/base/net/Pool.cpp @@ -484,6 +484,7 @@ void xmrig::Pool::rebuild() m_algorithms.push_back(m_algorithm); # ifndef XMRIG_PROXY_PROJECT + addVariant(VARIANT_RWZ); addVariant(VARIANT_4); addVariant(VARIANT_WOW); addVariant(VARIANT_2); diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index 7d03acfc..ae7ba7c7 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -66,6 +66,7 @@ static AlgoData const algorithms[] = { { "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF }, { "cryptonight/wow", "cn/wow", xmrig::CRYPTONIGHT, xmrig::VARIANT_WOW }, { "cryptonight/r", "cn/r", xmrig::CRYPTONIGHT, xmrig::VARIANT_4 }, + { "cryptonight/rwz", "cn/rwz", xmrig::CRYPTONIGHT, xmrig::VARIANT_RWZ }, # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, @@ -133,6 +134,7 @@ static const char *variants[] = { "gpu", "wow", "r", + "rwz" }; diff --git a/src/common/xmrig.h b/src/common/xmrig.h index c6a5f568..575251b7 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -76,6 +76,7 @@ enum Variant { VARIANT_GPU = 11, // CryptoNight-GPU (Ryo) VARIANT_WOW = 12, // CryptoNightR (Wownero) VARIANT_4 = 13, // CryptoNightR (Monero's variant 4) + VARIANT_RWZ = 14, // CryptoNight variant 2 with 3/4 iterations and reversed shuffle operation (Graft) VARIANT_MAX }; diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index e7232eb1..d762929c 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -436,7 +436,7 @@ static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m1 uint64_t* mem_out = (uint64_t*)&l[idx]; if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx); + VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { __m128i tmp = _mm_xor_si128(bx0, cx); @@ -530,9 +530,9 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx); + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0); } else { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } @@ -709,9 +709,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0); + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0); } else { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } @@ -767,9 +767,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1); + VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0); } else { - VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } diff --git a/src/crypto/CryptoNight_constants.h b/src/crypto/CryptoNight_constants.h index 6afedc90..a6623eba 100644 --- a/src/crypto/CryptoNight_constants.h +++ b/src/crypto/CryptoNight_constants.h @@ -42,6 +42,7 @@ constexpr const uint32_t CRYPTONIGHT_MASK = 0x1FFFF0; constexpr const uint32_t CRYPTONIGHT_ITER = 0x80000; constexpr const uint32_t CRYPTONIGHT_HALF_ITER = 0x40000; constexpr const uint32_t CRYPTONIGHT_XAO_ITER = 0x100000; +constexpr const uint32_t CRYPTONIGHT_WALTZ_ITER = 0x60000; constexpr const uint32_t CRYPTONIGHT_GPU_ITER = 0xC000; constexpr const uint32_t CRYPTONIGHT_GPU_MASK = 0x1FFFC0; @@ -134,6 +135,7 @@ template<> inline constexpr uint32_t cn_select_iter() template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_XAO_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_GPU_ITER; } +template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_WALTZ_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HEAVY_ITER; } @@ -158,6 +160,9 @@ inline uint32_t cn_select_iter(Algo algorithm, Variant variant) case VARIANT_TRTL: return CRYPTONIGHT_TRTL_ITER; + case VARIANT_RWZ: + return CRYPTONIGHT_WALTZ_ITER; + default: break; } @@ -199,6 +204,7 @@ template<> inline constexpr Variant cn_base_variant() { return VA template<> inline constexpr Variant cn_base_variant() { return VARIANT_GPU; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } +template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } template inline constexpr bool cn_is_cryptonight_r() { return false; } diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h index 4cfff0f8..4e84ac5d 100644 --- a/src/crypto/CryptoNight_monero.h +++ b/src/crypto/CryptoNight_monero.h @@ -83,11 +83,11 @@ sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \ } while (0) -# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse) \ do { \ - const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10)))); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ - const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30)))); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ @@ -96,15 +96,20 @@ } \ } while (0) -# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ +# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse) \ do { \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \ const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ - _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ - _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + if (reverse) { \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk1, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk3, _b)); \ + } else { \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + } \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ } while (0) @@ -128,11 +133,11 @@ sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ } while (0) -# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c) \ +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1, _c, reverse) \ do { \ - const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \ + const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x30 : 0x10)))); \ const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ - const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ (reverse ? 0x10 : 0x30)))); \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ @@ -141,15 +146,20 @@ } \ } while (0) -# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ +# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo, reverse) \ do { \ const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \ const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \ const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ + if (reverse) { \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b1))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b))); \ + } else { \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ + } \ vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ } while (0) #endif diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 7480fae5..4f10b221 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -230,6 +230,18 @@ const static uint8_t test_output_rto[160] = { 0xE7, 0x81, 0x4E, 0x2A, 0xBD, 0x62, 0xC1, 0x1B, 0x7C, 0xB9, 0x33, 0x7B, 0xEE, 0x95, 0x80, 0xB3 }; +const static uint8_t test_output_rwz[160] = { + 0x5f, 0x56, 0xc6, 0xb0, 0x99, 0x6b, 0xa2, 0x3e, 0x0b, 0xba, 0x07, 0x29, 0xc9, 0x90, 0x74, 0x85, + 0x5a, 0x10, 0xe3, 0x08, 0x7f, 0xdb, 0xfe, 0x94, 0x75, 0x33, 0x54, 0x73, 0x76, 0xf0, 0x75, 0xb8, + 0x8b, 0x70, 0x43, 0x9a, 0xfc, 0xf5, 0xeb, 0x15, 0xbb, 0xf9, 0xad, 0x9d, 0x2a, 0xbd, 0x72, 0x52, + 0x49, 0x54, 0x0b, 0x91, 0xea, 0x61, 0x7f, 0x98, 0x7d, 0x39, 0x17, 0xb7, 0xd7, 0x65, 0xff, 0x75, + 0x13, 0x21, 0x1d, 0xce, 0x61, 0x5a, 0xdc, 0x5f, 0x8c, 0xcb, 0x1f, 0x6f, 0xbb, 0x92, 0x88, 0xc3, + 0xe3, 0xe2, 0xfc, 0x4f, 0x62, 0xfb, 0xf0, 0x48, 0x02, 0x01, 0xd3, 0xbe, 0x77, 0x6a, 0x40, 0xca, + 0x9a, 0xe9, 0xba, 0x0c, 0xc0, 0x2b, 0x11, 0xf6, 0x9b, 0xee, 0x24, 0x3a, 0xd8, 0x86, 0x18, 0xd0, + 0xe8, 0xeb, 0xcb, 0x38, 0x2c, 0xf5, 0x99, 0x83, 0x14, 0x7b, 0x0c, 0x20, 0xbe, 0x50, 0xf4, 0x87, + 0x83, 0x41, 0x75, 0xd8, 0xd1, 0xdd, 0x4b, 0x73, 0xb3, 0x92, 0x8f, 0xe6, 0x1c, 0x72, 0x70, 0xf5, + 0x7c, 0xf6, 0x23, 0x3a, 0xb4, 0x5f, 0xdf, 0xde, 0xa6, 0x5a, 0x58, 0xec, 0x13, 0x5a, 0x23, 0x2f, +}; #ifndef XMRIG_NO_AEON // "cn-lite/0" diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index e53a4dbe..5c4de441 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -531,7 +531,7 @@ template static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i& cx) { if (BASE == xmrig::VARIANT_2) { - VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx); + VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1, cx, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { __m128i tmp = _mm_xor_si128(bx0, cx); @@ -653,9 +653,9 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx); + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx0, bx1, cx, 0); } else { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } @@ -754,6 +754,8 @@ extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm; extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm; @@ -843,6 +845,9 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ cn_trtl_mainloop_bulldozer_asm(ctx[0]); } } + else if (VARIANT == xmrig::VARIANT_RWZ) { + cnv2_rwz_mainloop_asm(ctx[0]); + } else if (xmrig::cn_is_cryptonight_r()) { ctx[0]->generated_code(ctx[0]); } @@ -881,6 +886,9 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ else if (VARIANT == xmrig::VARIANT_TRTL) { cn_trtl_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); } + else if (VARIANT == xmrig::VARIANT_RWZ) { + cnv2_rwz_double_mainloop_asm(ctx[0], ctx[1]); + } else if (xmrig::cn_is_cryptonight_r()) { ctx[0]->generated_code_double(ctx[0], ctx[1]); } @@ -995,9 +1003,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0); + VARIANT2_SHUFFLE(l0, idx0 & MASK, ax0, bx00, bx01, cx0, 0); } else { - VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); + VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } @@ -1051,9 +1059,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si if (BASE == xmrig::VARIANT_2) { if (VARIANT == xmrig::VARIANT_4) { - VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1); + VARIANT2_SHUFFLE(l1, idx1 & MASK, ax1, bx10, bx11, cx1, 0); } else { - VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); + VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); } } @@ -1155,9 +1163,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si lo = __umul128(idx, cl##part, &hi); \ if (BASE == xmrig::VARIANT_2) { \ if (VARIANT == xmrig::VARIANT_4) { \ - VARIANT2_SHUFFLE(l, idx & MASK, a, b0, b1, c); \ + VARIANT2_SHUFFLE(l, idx & MASK, a, b0, b1, c, 0); \ } else { \ - VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \ + VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo, (VARIANT == xmrig::VARIANT_RWZ ? 1 : 0)); \ } \ } \ if (VARIANT == xmrig::VARIANT_4) { \ diff --git a/src/crypto/asm/cn2/cnv2_rwz_double_main_loop.inc b/src/crypto/asm/cn2/cnv2_rwz_double_main_loop.inc new file mode 100644 index 00000000..d2d87173 --- /dev/null +++ b/src/crypto/asm/cn2/cnv2_rwz_double_main_loop.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 393216 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN(64) +rwz_main_loop_double: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rax+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm3 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r8+r13], xmm0 + xor r8d, 32 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r11+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js rwz_div_fix_1 +rwz_div_fix_1_ret: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js rwz_div_fix_2 +rwz_div_fix_2_ret: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je rwz_sqrt_fix_1 +rwz_sqrt_fix_1_ret: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je rwz_sqrt_fix_2 +rwz_sqrt_fix_2_ret: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm3, xmm6 + paddq xmm1, xmm11 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm3 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm0 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne rwz_main_loop_double + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp rwz_cnv2_double_mainloop_asm_endp + +rwz_div_fix_1: + dec rbx + add r11, rdx + jmp rwz_div_fix_1_ret + +rwz_div_fix_2: + dec rdx + add r8, r9 + jmp rwz_div_fix_2_ret + +rwz_sqrt_fix_1: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_1_ret + +rwz_sqrt_fix_2: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_2_ret + +rwz_cnv2_double_mainloop_asm_endp: diff --git a/src/crypto/asm/cn2/cnv2_rwz_main_loop.inc b/src/crypto/asm/cn2/cnv2_rwz_main_loop.inc new file mode 100644 index 00000000..021f787e --- /dev/null +++ b/src/crypto/asm/cn2/cnv2_rwz_main_loop.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 393216 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN(64) +rwz_main_loop: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm0, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm2, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + paddq xmm1, xmm7 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je rwz_sqrt_fixup + psrlq xmm3, 19 +rwz_sqrt_fixup_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm4 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm5 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm2 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne rwz_main_loop + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_rwz_main_loop_endp + +rwz_sqrt_fixup: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp rwz_sqrt_fixup_ret + +cnv2_rwz_main_loop_endp: diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index a792337f..c2c08739 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -15,6 +15,8 @@ .global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv2_rwz_mainloop_asm) +.global FN_PREFIX(cnv2_rwz_double_mainloop_asm) ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): @@ -52,3 +54,21 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): add rsp, 48 ret 0 mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv2_rwz_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_rwz_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv2_rwz_double_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_rwz_double_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index f1384be8..f0766a7c 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -3,6 +3,8 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm +PUBLIC cnv2_rwz_mainloop_asm +PUBLIC cnv2_rwz_double_mainloop_asm ALIGN(64) cnv2_mainloop_ivybridge_asm PROC @@ -32,5 +34,19 @@ cnv2_double_mainloop_sandybridge_asm PROC mov eax, 3735929054 cnv2_double_mainloop_sandybridge_asm ENDP +ALIGN(64) +cnv2_rwz_mainloop_asm PROC + INCLUDE cn2/cnv2_rwz_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv2_rwz_mainloop_asm ENDP + +ALIGN(64) +cnv2_rwz_double_mainloop_asm PROC + INCLUDE cn2/cnv2_rwz_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv2_rwz_double_mainloop_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/crypto/asm/win64/cn2/cnv2_rwz_double_main_loop.inc b/src/crypto/asm/win64/cn2/cnv2_rwz_double_main_loop.inc new file mode 100644 index 00000000..69ca8793 --- /dev/null +++ b/src/crypto/asm/win64/cn2/cnv2_rwz_double_main_loop.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 393216 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movd xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movd xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN(64) +rwz_main_loop_double: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rax+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm3 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r8+r13], xmm0 + xor r8d, 32 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r11+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js rwz_div_fix_1 +rwz_div_fix_1_ret: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js rwz_div_fix_2 +rwz_div_fix_2_ret: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je rwz_sqrt_fix_1 +rwz_sqrt_fix_1_ret: + + movd r9, xmm10 + psrldq xmm1, 8 + movd r8, xmm1 + test r8, 524287 + je rwz_sqrt_fix_2 +rwz_sqrt_fix_2_ret: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm3, xmm6 + paddq xmm1, xmm11 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm3 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm0 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne rwz_main_loop_double + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp rwz_cnv2_double_mainloop_asm_endp + +rwz_div_fix_1: + dec rbx + add r11, rdx + jmp rwz_div_fix_1_ret + +rwz_div_fix_2: + dec rdx + add r8, r9 + jmp rwz_div_fix_2_ret + +rwz_sqrt_fix_1: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_1_ret + +rwz_sqrt_fix_2: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_2_ret + +rwz_cnv2_double_mainloop_asm_endp: diff --git a/src/crypto/asm/win64/cn2/cnv2_rwz_main_loop.inc b/src/crypto/asm/win64/cn2/cnv2_rwz_main_loop.inc new file mode 100644 index 00000000..99317730 --- /dev/null +++ b/src/crypto/asm/win64/cn2/cnv2_rwz_main_loop.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 393216 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movd xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN(64) +rwz_main_loop: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movd rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm0, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm2, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + paddq xmm1, xmm7 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movd rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movd rdx, xmm3 + test edx, 524287 + je rwz_sqrt_fixup + psrlq xmm3, 19 +rwz_sqrt_fixup_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movd xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movd xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm4 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm5 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm2 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne rwz_main_loop + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_rwz_main_loop_endp + +rwz_sqrt_fixup: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp rwz_sqrt_fixup_ret + +cnv2_rwz_main_loop_endp: diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 1200c4df..3c4dc020 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -5,6 +5,8 @@ .global cnv2_mainloop_ryzen_asm .global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm +.global FN_PREFIX(cnv2_rwz_mainloop_asm) +.global FN_PREFIX(cnv2_rwz_double_mainloop_asm) ALIGN(64) cnv2_mainloop_ivybridge_asm: @@ -29,3 +31,21 @@ cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv2_rwz_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_rwz_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv2_rwz_double_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_rwz_double_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index 846b860c..57246cf5 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -3,6 +3,8 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm +PUBLIC cnv2_rwz_mainloop_asm +PUBLIC cnv2_rwz_double_mainloop_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC @@ -32,5 +34,19 @@ cnv2_double_mainloop_sandybridge_asm PROC mov eax, 3735929054 cnv2_double_mainloop_sandybridge_asm ENDP +ALIGN(64) +cnv2_rwz_mainloop_asm PROC + INCLUDE cn2/cnv2_rwz_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv2_rwz_mainloop_asm ENDP + +ALIGN(64) +cnv2_rwz_double_mainloop_asm PROC + INCLUDE cn2/cnv2_rwz_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv2_rwz_double_mainloop_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 6065bc3a..dfa6518c 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -180,6 +180,8 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a add_asm_func(asm_func_map); # endif + add_asm_func(asm_func_map); + asm_func_map_initialized = true; } @@ -323,6 +325,17 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a cryptonight_quad_hash, cryptonight_penta_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + # ifndef XMRIG_NO_AEON cryptonight_single_hash, cryptonight_double_hash, @@ -358,6 +371,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -373,6 +387,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # endif # ifndef XMRIG_NO_SUMO @@ -422,6 +437,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -437,6 +453,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # endif # ifndef XMRIG_NO_CN_PICO @@ -465,6 +482,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_0 nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_1 @@ -480,6 +498,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_GPU nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_WOW nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_4 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RWZ # endif }; diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index 1d1c7519..1bc52bf4 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -64,7 +64,8 @@ bool MultiWorker::selfTest() verify(VARIANT_RTO, test_output_rto) && verify(VARIANT_HALF, test_output_half) && verify2(VARIANT_WOW, test_output_wow) && - verify2(VARIANT_4, test_output_r); + verify2(VARIANT_4, test_output_r) && + verify(VARIANT_RWZ, test_output_rwz); # ifndef XMRIG_NO_CN_GPU if (!rc || N > 1) {