diff --git a/CHANGELOG.md b/CHANGELOG.md index e0d97b6d..6242bbee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,22 @@ +# v6.16.2 +- [#2751](https://github.com/xmrig/xmrig/pull/2751) Fixed crash on CPUs supporting VAES and running GCC-compiled xmrig. +- [#2761](https://github.com/xmrig/xmrig/pull/2761) Fixed broken auto-tuning in GCC Windows build. +- [#2771](https://github.com/xmrig/xmrig/issues/2771) Fixed environment variables support for GhostRider and KawPow. +- [#2769](https://github.com/xmrig/xmrig/pull/2769) Performance fixes: + - Fixed several performance bottlenecks introduced in v6.16.1. + - Fixed overall GCC-compiled build performance, it's the same speed as MSVC build now. + - **Linux builds are up to 10% faster now compared to v6.16.0 GCC build.** + - **Windows builds are up to 5% faster now compared to v6.16.0 MSVC build.** + # v6.16.1 - [#2729](https://github.com/xmrig/xmrig/pull/2729) GhostRider fixes: - - Added average hashrate display - - Fixed the number of threads shown at startup - - Fixed `--threads` or `-t` command line option (but `--cpu-max-threads-hint` is recommended to use) + - Added average hashrate display. + - Fixed the number of threads shown at startup. + - Fixed `--threads` or `-t` command line option (but `--cpu-max-threads-hint` is recommended to use). - [#2738](https://github.com/xmrig/xmrig/pull/2738) GhostRider fixes: - - Fixed "difficulty is not a number" error when diff is high on some pools - - Fixed GhostRider compilation when WITH_KAWPOW=OFF -- [#2740](https://github.com/xmrig/xmrig/pull/2740) Added VAES support for Cryptonight variants **+4% speedup on Zen3** + - Fixed "difficulty is not a number" error when diff is high on some pools. + - Fixed GhostRider compilation when `WITH_KAWPOW=OFF`. +- [#2740](https://github.com/xmrig/xmrig/pull/2740) Added VAES support for Cryptonight variants **+4% speedup on Zen3**. - VAES instructions are available on Intel Ice Lake/AMD Zen3 and newer CPUs. - +4% speedup on Ryzen 5 5600X. diff --git a/src/backend/cpu/CpuWorker.h b/src/backend/cpu/CpuWorker.h index a15378ed..ade256ec 100644 --- a/src/backend/cpu/CpuWorker.h +++ b/src/backend/cpu/CpuWorker.h @@ -55,7 +55,7 @@ public: size_t threads() const override { # ifdef XMRIG_ALGO_GHOSTRIDER - return m_ghHelper ? 2 : 1; + return ((m_algorithm.family() == Algorithm::GHOSTRIDER) && m_ghHelper) ? 2 : 1; # else return 1; # endif diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index d64612e5..81d9395e 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -30,6 +30,12 @@ #endif +#include "crypto/cn/CryptoNight_monero.h" +#ifdef XMRIG_VAES +# include "crypto/cn/CryptoNight_x86_vaes.h" +#endif + + #include "backend/cpu/platform/BasicCpuInfo.h" #include "3rdparty/rapidjson/document.h" #include "crypto/common/Assembly.h" @@ -140,7 +146,7 @@ static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); } static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); } -static inline bool has_vaes() { return has_feature(EXTENDED_FEATURES, ECX_Reg, 1 << 9); } +static inline bool has_vaes() { return has_feature(EXTENDED_FEATURES, ECX_Reg, 1 << 9) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); } static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); } static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); } @@ -294,6 +300,9 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : } } # endif + + cn_sse41_enabled = has(FLAG_SSE41); + cn_vaes_enabled = has(FLAG_VAES); } diff --git a/src/base/base.cmake b/src/base/base.cmake index 5761a09b..db26e2e9 100644 --- a/src/base/base.cmake +++ b/src/base/base.cmake @@ -132,6 +132,7 @@ set(SOURCES_BASE src/base/net/tools/LineReader.cpp src/base/net/tools/NetBuffer.cpp src/base/tools/Arguments.cpp + src/base/tools/Chrono.cpp src/base/tools/cryptonote/BlockTemplate.cpp src/base/tools/cryptonote/crypto-ops-data.c src/base/tools/cryptonote/crypto-ops.c diff --git a/src/base/net/stratum/EthStratumClient.cpp b/src/base/net/stratum/EthStratumClient.cpp index 3ebce708..fd877d72 100644 --- a/src/base/net/stratum/EthStratumClient.cpp +++ b/src/base/net/stratum/EthStratumClient.cpp @@ -73,7 +73,7 @@ int64_t xmrig::EthStratumClient::submit(const JobResult& result) auto& allocator = doc.GetAllocator(); Value params(kArrayType); - params.PushBack(m_pool.user().toJSON(), allocator); + params.PushBack(m_user.toJSON(), allocator); params.PushBack(result.jobId.toJSON(), allocator); # ifdef XMRIG_ALGO_GHOSTRIDER @@ -471,8 +471,8 @@ void xmrig::EthStratumClient::authorize() auto &allocator = doc.GetAllocator(); Value params(kArrayType); - params.PushBack(m_pool.user().toJSON(), allocator); - params.PushBack(m_pool.password().toJSON(), allocator); + params.PushBack(m_user.toJSON(), allocator); + params.PushBack(m_password.toJSON(), allocator); JsonRequest::create(doc, m_sequence, "mining.authorize", params); diff --git a/src/base/net/stratum/benchmark/BenchClient.cpp b/src/base/net/stratum/benchmark/BenchClient.cpp index 80db4747..a9459e1f 100644 --- a/src/base/net/stratum/benchmark/BenchClient.cpp +++ b/src/base/net/stratum/benchmark/BenchClient.cpp @@ -50,6 +50,7 @@ xmrig::BenchClient::BenchClient(const std::shared_ptr &benchmark, I # ifdef XMRIG_ALGO_GHOSTRIDER if (m_benchmark->algorithm() == Algorithm::GHOSTRIDER_RTM) { + const uint32_t q = (benchmark->rotation() / 20) & 1; const uint32_t r = benchmark->rotation() % 20; static constexpr uint32_t indices[20][3] = { @@ -75,9 +76,9 @@ xmrig::BenchClient::BenchClient(const std::shared_ptr &benchmark, I { 3, 4, 5 }, }; - blob[ 8] = '0' + indices[r][1]; + blob[ 8] = '0' + indices[r][q ? 2 : 1]; blob[ 9] = '0' + indices[r][0]; - blob[11] = '0' + indices[r][2]; + blob[11] = '0' + indices[r][q ? 1 : 2]; } # endif diff --git a/src/base/tools/Chrono.cpp b/src/base/tools/Chrono.cpp new file mode 100644 index 00000000..5697472c --- /dev/null +++ b/src/base/tools/Chrono.cpp @@ -0,0 +1,44 @@ +/* XMRig + * Copyright (c) 2018-2021 SChernykh + * Copyright (c) 2016-2021 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include "Chrono.h" + + +#ifdef XMRIG_OS_WIN +# include +#endif + + +namespace xmrig { + + +double Chrono::highResolutionMSecs() +{ +# ifdef XMRIG_OS_WIN + LARGE_INTEGER f, t; + QueryPerformanceFrequency(&f); + QueryPerformanceCounter(&t); + return static_cast(t.QuadPart) * 1e3 / f.QuadPart; +# else + using namespace std::chrono; + return static_cast(duration_cast(high_resolution_clock::now().time_since_epoch()).count()) / 1e6; +# endif +} + + +} /* namespace xmrig */ diff --git a/src/base/tools/Chrono.h b/src/base/tools/Chrono.h index 78da18c1..65c3d5ae 100644 --- a/src/base/tools/Chrono.h +++ b/src/base/tools/Chrono.h @@ -29,12 +29,7 @@ namespace xmrig { class Chrono { public: - static inline uint64_t highResolutionMSecs() - { - using namespace std::chrono; - - return static_cast(time_point_cast(high_resolution_clock::now()).time_since_epoch().count()); - } + static double highResolutionMSecs(); static inline uint64_t steadyMSecs() diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 0df12bca..a1ef9015 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -55,6 +55,10 @@ } while (0) +bool cn_sse41_enabled = false; +bool cn_vaes_enabled = false; + + #ifdef XMRIG_FEATURE_ASM # define ADD_FN_ASM(algo) do { \ m_map[algo]->data[AV_SINGLE][Assembly::INTEL] = cryptonight_single_hash_asm; \ @@ -97,6 +101,27 @@ cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm = nullptr; cn_mainloop_fun cn_upx2_mainloop_asm = nullptr; cn_mainloop_fun cn_upx2_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr0_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_single_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_single_mainloop_asm = nullptr; + +cn_mainloop_fun cn_gr0_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_double_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_double_mainloop_asm = nullptr; + +cn_mainloop_fun cn_gr0_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr1_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr2_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr3_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr4_quad_mainloop_asm = nullptr; +cn_mainloop_fun cn_gr5_quad_mainloop_asm = nullptr; + template static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask = CnAlgo().mask()) @@ -136,7 +161,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma static void patchAsmVariants() { - const int allocation_size = 131072; + constexpr size_t allocation_size = 0x20000; auto base = static_cast(VirtualMemory::allocateExecutableMemory(allocation_size, false)); cn_half_mainloop_ivybridge_asm = reinterpret_cast (base + 0x0000); @@ -173,6 +198,29 @@ static void patchAsmVariants() cn_upx2_double_mainloop_asm = reinterpret_cast (base + 0x15000); # endif +# ifdef XMRIG_ALGO_GHOSTRIDER + cn_gr0_single_mainloop_asm = reinterpret_cast (base + 0x16000); + cn_gr1_single_mainloop_asm = reinterpret_cast (base + 0x16800); + cn_gr2_single_mainloop_asm = reinterpret_cast (base + 0x17000); + cn_gr3_single_mainloop_asm = reinterpret_cast (base + 0x17800); + cn_gr4_single_mainloop_asm = reinterpret_cast (base + 0x18000); + cn_gr5_single_mainloop_asm = reinterpret_cast (base + 0x18800); + + cn_gr0_double_mainloop_asm = reinterpret_cast (base + 0x19000); + cn_gr1_double_mainloop_asm = reinterpret_cast (base + 0x19800); + cn_gr2_double_mainloop_asm = reinterpret_cast (base + 0x1A000); + cn_gr3_double_mainloop_asm = reinterpret_cast (base + 0x1A800); + cn_gr4_double_mainloop_asm = reinterpret_cast (base + 0x1B000); + cn_gr5_double_mainloop_asm = reinterpret_cast (base + 0x1B800); + + cn_gr0_quad_mainloop_asm = reinterpret_cast (base + 0x1C000); + cn_gr1_quad_mainloop_asm = reinterpret_cast (base + 0x1C800); + cn_gr2_quad_mainloop_asm = reinterpret_cast (base + 0x1D000); + cn_gr3_quad_mainloop_asm = reinterpret_cast (base + 0x1D800); + cn_gr4_quad_mainloop_asm = reinterpret_cast (base + 0x1E000); + cn_gr5_quad_mainloop_asm = reinterpret_cast (base + 0x1E800); +# endif + { constexpr uint32_t ITER = CnAlgo().iterations(); @@ -230,7 +278,30 @@ static void patchAsmVariants() patchCode(cn_upx2_mainloop_asm, cnv2_rwz_mainloop_asm, ITER, MASK); patchCode(cn_upx2_double_mainloop_asm, cnv2_rwz_double_mainloop_asm, ITER, MASK); } -#endif +# endif + +# ifdef XMRIG_ALGO_GHOSTRIDER + patchCode(cn_gr0_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_single_mainloop_asm, cnv1_single_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + + patchCode(cn_gr0_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_double_mainloop_asm, cnv1_double_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + + patchCode(cn_gr0_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr1_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr2_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr3_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr4_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); + patchCode(cn_gr5_quad_mainloop_asm, cnv1_quad_mainloop_asm, CnAlgo().iterations(), CnAlgo().mask()); +# endif VirtualMemory::protectRX(base, allocation_size); VirtualMemory::flushInstructionCache(base, allocation_size); diff --git a/src/crypto/cn/CryptoNight.h b/src/crypto/cn/CryptoNight.h index fc8d6787..897890d2 100644 --- a/src/crypto/cn/CryptoNight.h +++ b/src/crypto/cn/CryptoNight.h @@ -52,8 +52,10 @@ struct cryptonight_r_data { struct cryptonight_ctx { alignas(16) uint8_t state[224]; alignas(16) uint8_t *memory; + const uint32_t* tweak1_table; + uint64_t tweak1_2; - uint8_t unused[40]; + uint8_t unused[24]; const uint32_t *saes_table; cn_mainloop_fun_ms_abi generated_code; diff --git a/src/crypto/cn/CryptoNight_monero.h b/src/crypto/cn/CryptoNight_monero.h index f34c963c..a9975e78 100644 --- a/src/crypto/cn/CryptoNight_monero.h +++ b/src/crypto/cn/CryptoNight_monero.h @@ -204,4 +204,7 @@ v4_random_math(code##part, r##part); \ } +extern bool cn_sse41_enabled; +extern bool cn_vaes_enabled; + #endif /* XMRIG_CRYPTONIGHT_MONERO_H */ diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h index 2fe64edb..0d27f5e2 100644 --- a/src/crypto/cn/CryptoNight_x86.h +++ b/src/crypto/cn/CryptoNight_x86.h @@ -295,8 +295,8 @@ static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx) constexpr CnAlgo props; # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes(ctx); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes(ctx, props.memory(), props.half_mem()); return; } # endif @@ -409,8 +409,8 @@ static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx) constexpr CnAlgo props; # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes(ctx); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes(ctx, props.memory(), props.half_mem()); return; } # endif @@ -634,9 +634,35 @@ static inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var) cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc)); } +#ifdef XMRIG_FEATURE_ASM +template +static void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height); +#endif + template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { +# ifdef XMRIG_FEATURE_ASM + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_single_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; + } + } +# endif + constexpr CnAlgo props; constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); @@ -822,13 +848,16 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si #ifdef XMRIG_FEATURE_ASM +extern "C" void cnv1_single_mainloop_asm(cryptonight_ctx * *ctx); +extern "C" void cnv1_double_mainloop_asm(cryptonight_ctx **ctx); +extern "C" void cnv1_quad_mainloop_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx **ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx); -extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx); +extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx **ctx); namespace xmrig { @@ -865,6 +894,28 @@ extern cn_mainloop_fun cn_double_double_mainloop_sandybridge_asm; extern cn_mainloop_fun cn_upx2_mainloop_asm; extern cn_mainloop_fun cn_upx2_double_mainloop_asm; +extern cn_mainloop_fun cn_gr0_single_mainloop_asm; +extern cn_mainloop_fun cn_gr1_single_mainloop_asm; +extern cn_mainloop_fun cn_gr2_single_mainloop_asm; +extern cn_mainloop_fun cn_gr3_single_mainloop_asm; +extern cn_mainloop_fun cn_gr4_single_mainloop_asm; +extern cn_mainloop_fun cn_gr5_single_mainloop_asm; + +extern cn_mainloop_fun cn_gr0_double_mainloop_asm; +extern cn_mainloop_fun cn_gr1_double_mainloop_asm; +extern cn_mainloop_fun cn_gr2_double_mainloop_asm; +extern cn_mainloop_fun cn_gr3_double_mainloop_asm; +extern cn_mainloop_fun cn_gr4_double_mainloop_asm; +extern cn_mainloop_fun cn_gr5_double_mainloop_asm; + +extern cn_mainloop_fun cn_gr0_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr1_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr2_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr3_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr4_quad_mainloop_asm; +extern cn_mainloop_fun cn_gr5_quad_mainloop_asm; + + } // namespace xmrig @@ -1017,8 +1068,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ } # ifdef XMRIG_VAES - if (!props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1065,8 +1116,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ } # ifdef XMRIG_VAES - if (!props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1090,9 +1141,130 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ namespace xmrig { +#ifdef XMRIG_FEATURE_ASM +template +static NOINLINE void cryptonight_single_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) +{ + constexpr CnAlgo props; + constexpr Algorithm::Id BASE = props.base(); + + if (BASE == Algorithm::CN_1 && size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state); + + if (props.half_mem()) { + ctx[0]->first_half = true; + } + cn_explode_scratchpad(ctx[0]); + + VARIANT1_INIT(0); + ctx[0]->tweak1_2 = tweak1_2_0; + ctx[0]->tweak1_table = tweak1_table; + if (ALGO == Algorithm::CN_GR_0) cn_gr0_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_single_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_single_mainloop_asm(ctx); + + cn_implode_scratchpad(ctx[0]); + keccakf(reinterpret_cast(ctx[0]->state), 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +template +static NOINLINE void cryptonight_double_hash_gr_sse41(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) +{ + constexpr CnAlgo props; + constexpr Algorithm::Id BASE = props.base(); + + if (BASE == Algorithm::CN_1 && size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state); + keccak(input + size, size, ctx[1]->state); + + if (props.half_mem()) { + ctx[0]->first_half = true; + ctx[1]->first_half = true; + } + +# ifdef XMRIG_VAES + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + } + else +# endif + { + cn_explode_scratchpad(ctx[0]); + cn_explode_scratchpad(ctx[1]); + } + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + ctx[0]->tweak1_2 = tweak1_2_0; + ctx[1]->tweak1_2 = tweak1_2_1; + + ctx[0]->tweak1_table = tweak1_table; + + if (ALGO == Algorithm::CN_GR_0) cn_gr0_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_double_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_double_mainloop_asm(ctx); + +# ifdef XMRIG_VAES + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + } + else +# endif + { + cn_implode_scratchpad(ctx[0]); + cn_implode_scratchpad(ctx[1]); + } + + keccakf(reinterpret_cast(ctx[0]->state), 24); + keccakf(reinterpret_cast(ctx[1]->state), 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} +#endif + + template inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { +# ifdef XMRIG_FEATURE_ASM + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_double_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; + } + } +# endif + constexpr CnAlgo props; constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); @@ -1130,8 +1302,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1334,8 +1506,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); } else # endif @@ -1352,27 +1524,15 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } -static inline void cryptonight_monero_tweak_gr(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i cx) -{ - __m128i tmp = _mm_xor_si128(bx0, cx); - mem_out[0] = _mm_cvtsi128_si64(tmp); - - tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); - uint64_t vh = _mm_cvtsi128_si64(tmp); - - mem_out[1] = vh ^ tweak1_table[static_cast(vh) >> 24]; -} - - -template -void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) +#ifdef XMRIG_FEATURE_ASM +template +static NOINLINE void cryptonight_quad_hash_gr_sse41(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height) { constexpr CnAlgo props; - constexpr size_t MASK = props.mask(); constexpr Algorithm::Id BASE = props.base(); if (BASE == Algorithm::CN_1 && size < 43) { - memset(output, 0, 64); + memset(output, 0, 32 * 4); return; } @@ -1381,21 +1541,6 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u keccak(input + size * 2, size, ctx[2]->state); keccak(input + size * 3, size, ctx[3]->state); - uint8_t* l0 = ctx[0]->memory; - uint8_t* l1 = ctx[1]->memory; - uint8_t* l2 = ctx[2]->memory; - uint8_t* l3 = ctx[3]->memory; - - uint64_t* h0 = reinterpret_cast(ctx[0]->state); - uint64_t* h1 = reinterpret_cast(ctx[1]->state); - uint64_t* h2 = reinterpret_cast(ctx[2]->state); - uint64_t* h3 = reinterpret_cast(ctx[3]->state); - - VARIANT1_INIT(0); - VARIANT1_INIT(1); - VARIANT1_INIT(2); - VARIANT1_INIT(3); - if (props.half_mem()) { ctx[0]->first_half = true; ctx[1]->first_half = true; @@ -1404,162 +1549,58 @@ void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, u } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_explode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif { - cn_explode_scratchpad(ctx[0]); - cn_explode_scratchpad(ctx[1]); - cn_explode_scratchpad(ctx[2]); - cn_explode_scratchpad(ctx[3]); + cn_explode_scratchpad(ctx[0]); + cn_explode_scratchpad(ctx[1]); + cn_explode_scratchpad(ctx[2]); + cn_explode_scratchpad(ctx[3]); } - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t al1 = h1[0] ^ h1[4]; - uint64_t al2 = h2[0] ^ h2[4]; - uint64_t al3 = h3[0] ^ h3[4]; + VARIANT1_INIT(0); ctx[0]->tweak1_2 = tweak1_2_0; + VARIANT1_INIT(1); ctx[1]->tweak1_2 = tweak1_2_1; + VARIANT1_INIT(2); ctx[2]->tweak1_2 = tweak1_2_2; + VARIANT1_INIT(3); ctx[3]->tweak1_2 = tweak1_2_3; - uint64_t ah0 = h0[1] ^ h0[5]; - uint64_t ah1 = h1[1] ^ h1[5]; - uint64_t ah2 = h2[1] ^ h2[5]; - uint64_t ah3 = h3[1] ^ h3[5]; + ctx[0]->tweak1_table = tweak1_table; - __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx20 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx30 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - - uint64_t idx0 = al0; - uint64_t idx1 = al1; - uint64_t idx2 = al2; - uint64_t idx3 = al3; - - __m128i cx0, cx1, cx2, cx3; - - if (!SOFT_AES) { - cx0 = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); - cx1 = _mm_load_si128(reinterpret_cast(&l1[idx1 & MASK])); - cx2 = _mm_load_si128(reinterpret_cast(&l2[idx2 & MASK])); - cx3 = _mm_load_si128(reinterpret_cast(&l3[idx3 & MASK])); - } - - for (size_t i = 0; i < props.iterations(); i++) { - const __m128i ax0 = _mm_set_epi64x(ah0, al0); - const __m128i ax1 = _mm_set_epi64x(ah1, al1); - const __m128i ax2 = _mm_set_epi64x(ah2, al2); - const __m128i ax3 = _mm_set_epi64x(ah3, al3); - - if (SOFT_AES) { - cx0 = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast(saes_table)); - cx1 = soft_aesenc(&l1[idx1 & MASK], ax1, reinterpret_cast(saes_table)); - cx2 = soft_aesenc(&l2[idx2 & MASK], ax2, reinterpret_cast(saes_table)); - cx3 = soft_aesenc(&l3[idx3 & MASK], ax3, reinterpret_cast(saes_table)); - } - else { - cx0 = _mm_aesenc_si128(cx0, ax0); - cx1 = _mm_aesenc_si128(cx1, ax1); - cx2 = _mm_aesenc_si128(cx2, ax2); - cx3 = _mm_aesenc_si128(cx3, ax3); - if (MASK > 131072) { - _mm_prefetch((const char*)(&l0[_mm_cvtsi128_si32(cx0) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l1[_mm_cvtsi128_si32(cx1) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l2[_mm_cvtsi128_si32(cx2) & MASK]), _MM_HINT_T0); - _mm_prefetch((const char*)(&l3[_mm_cvtsi128_si32(cx3) & MASK]), _MM_HINT_T0); - } - } - - cryptonight_monero_tweak_gr((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, cx0); - cryptonight_monero_tweak_gr((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, cx1); - cryptonight_monero_tweak_gr((uint64_t*)&l2[idx2 & MASK], l2, idx2 & MASK, ax2, bx20, cx2); - cryptonight_monero_tweak_gr((uint64_t*)&l3[idx3 & MASK], l3, idx3 & MASK, ax3, bx30, cx3); - - idx0 = _mm_cvtsi128_si64(cx0); - idx1 = _mm_cvtsi128_si64(cx1); - idx2 = _mm_cvtsi128_si64(cx2); - idx3 = _mm_cvtsi128_si64(cx3); - - uint64_t hi, lo, cl, ch; - - cl = ((uint64_t*)&l0[idx0 & MASK])[0]; - ch = ((uint64_t*)&l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - al0 += hi; - ah0 += lo; - ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; - al0 ^= cl; - ah0 ^= ch; - idx0 = al0; - bx00 = cx0; - if (!SOFT_AES) cx0 = _mm_load_si128(reinterpret_cast(&l0[idx0 & MASK])); - - cl = ((uint64_t*)&l1[idx1 & MASK])[0]; - ch = ((uint64_t*)&l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - al1 += hi; - ah1 += lo; - ((uint64_t*)&l1[idx1 & MASK])[0] = al1; - ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1; - al1 ^= cl; - ah1 ^= ch; - idx1 = al1; - bx10 = cx1; - if (!SOFT_AES) cx1 = _mm_load_si128(reinterpret_cast(&l1[idx1 & MASK])); - - cl = ((uint64_t*)&l2[idx2 & MASK])[0]; - ch = ((uint64_t*)&l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - al2 += hi; - ah2 += lo; - ((uint64_t*)&l2[idx2 & MASK])[0] = al2; - ((uint64_t*)&l2[idx2 & MASK])[1] = ah2 ^ tweak1_2_2; - al2 ^= cl; - ah2 ^= ch; - idx2 = al2; - bx20 = cx2; - if (!SOFT_AES) cx2 = _mm_load_si128(reinterpret_cast(&l2[idx2 & MASK])); - - cl = ((uint64_t*)&l3[idx3 & MASK])[0]; - ch = ((uint64_t*)&l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - al3 += hi; - ah3 += lo; - ((uint64_t*)&l3[idx3 & MASK])[0] = al3; - ((uint64_t*)&l3[idx3 & MASK])[1] = ah3 ^ tweak1_2_3; - al3 ^= cl; - ah3 ^= ch; - idx3 = al3; - bx30 = cx3; - if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast(&l3[idx3 & MASK])); - } + if (ALGO == Algorithm::CN_GR_0) cn_gr0_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_1) cn_gr1_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_2) cn_gr2_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_3) cn_gr3_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_4) cn_gr4_quad_mainloop_asm(ctx); + if (ALGO == Algorithm::CN_GR_5) cn_gr5_quad_mainloop_asm(ctx); # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_implode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif { - cn_implode_scratchpad(ctx[0]); - cn_implode_scratchpad(ctx[1]); - cn_implode_scratchpad(ctx[2]); - cn_implode_scratchpad(ctx[3]); + cn_implode_scratchpad(ctx[0]); + cn_implode_scratchpad(ctx[1]); + cn_implode_scratchpad(ctx[2]); + cn_implode_scratchpad(ctx[3]); } - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); + keccakf(reinterpret_cast(ctx[0]->state), 24); + keccakf(reinterpret_cast(ctx[1]->state), 24); + keccakf(reinterpret_cast(ctx[2]->state), 24); + keccakf(reinterpret_cast(ctx[3]->state), 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); extra_hashes[ctx[2]->state[0] & 3](ctx[2]->state, 200, output + 64); extra_hashes[ctx[3]->state[0] & 3](ctx[3]->state, 200, output + 96); } +#endif #define CN_STEP1(a, b0, b1, c, l, ptr, idx, conc_var) \ @@ -1755,13 +1796,26 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si template inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height) { - const auto arch = Cpu::info()->arch(); - if ((arch >= ICpuInfo::ARCH_ZEN) && (arch <= ICpuInfo::ARCH_ZEN3)) { - if ((ALGO == Algorithm::CN_GR_0) || (ALGO == Algorithm::CN_GR_1) || (ALGO == Algorithm::CN_GR_2) || (ALGO == Algorithm::CN_GR_3) || (ALGO == Algorithm::CN_GR_4) || (ALGO == Algorithm::CN_GR_5)) { - cryptonight_quad_hash_zen(input, size, output, ctx, height); - return; +# ifdef XMRIG_FEATURE_ASM + if (!SOFT_AES) { + switch (ALGO) { + case Algorithm::CN_GR_0: + case Algorithm::CN_GR_1: + case Algorithm::CN_GR_2: + case Algorithm::CN_GR_3: + case Algorithm::CN_GR_4: + case Algorithm::CN_GR_5: + if (cn_sse41_enabled) { + cryptonight_quad_hash_gr_sse41(input, size, output, ctx, height); + return; + } + break; + + default: + break; } } +# endif constexpr CnAlgo props; constexpr size_t MASK = props.mask(); @@ -1788,9 +1842,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_explode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_explode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_explode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_explode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif @@ -1851,9 +1905,9 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size } # ifdef XMRIG_VAES - if (!SOFT_AES && !props.isHeavy() && Cpu::info()->hasVAES()) { - cn_implode_scratchpad_vaes_double(ctx[0], ctx[1]); - cn_implode_scratchpad_vaes_double(ctx[2], ctx[3]); + if (!SOFT_AES && !props.isHeavy() && cn_vaes_enabled) { + cn_implode_scratchpad_vaes_double(ctx[0], ctx[1], props.memory(), props.half_mem()); + cn_implode_scratchpad_vaes_double(ctx[2], ctx[3], props.memory(), props.half_mem()); } else # endif diff --git a/src/crypto/cn/CryptoNight_x86_vaes.cpp b/src/crypto/cn/CryptoNight_x86_vaes.cpp index 177da813..41da111b 100644 --- a/src/crypto/cn/CryptoNight_x86_vaes.cpp +++ b/src/crypto/cn/CryptoNight_x86_vaes.cpp @@ -162,12 +162,9 @@ static FORCEINLINE void vaes_round(__m256i key, __m256i& x0, __m256i& x1, __m256 namespace xmrig { -template -NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) +NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1); __m256i xin01, xin23, xin45, xin67; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -177,18 +174,18 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) vaes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); - if (props.half_mem() && !ctx->first_half) { + if (half_mem && !ctx->first_half) { const __m256i* p = reinterpret_cast(ctx->save_state); - xin01 = _mm256_load_si256(p + 0); - xin23 = _mm256_load_si256(p + 1); - xin45 = _mm256_load_si256(p + 2); - xin67 = _mm256_load_si256(p + 3); + xin01 = _mm256_loadu_si256(p + 0); + xin23 = _mm256_loadu_si256(p + 1); + xin45 = _mm256_loadu_si256(p + 2); + xin67 = _mm256_loadu_si256(p + 3); } else { - xin01 = _mm256_load_si256(reinterpret_cast(input + 4)); - xin23 = _mm256_load_si256(reinterpret_cast(input + 6)); - xin45 = _mm256_load_si256(reinterpret_cast(input + 8)); - xin67 = _mm256_load_si256(reinterpret_cast(input + 10)); + xin01 = _mm256_loadu_si256(reinterpret_cast(input + 4)); + xin23 = _mm256_loadu_si256(reinterpret_cast(input + 6)); + xin45 = _mm256_loadu_si256(reinterpret_cast(input + 8)); + xin67 = _mm256_loadu_si256(reinterpret_cast(input + 10)); } constexpr int output_increment = 64 / sizeof(__m256i); @@ -226,24 +223,21 @@ NOINLINE void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx) prefetch_ptr = output; } - if (props.half_mem() && ctx->first_half) { + if (half_mem && ctx->first_half) { __m256i* p = reinterpret_cast<__m256i*>(ctx->save_state); - _mm256_store_si256(p + 0, xin01); - _mm256_store_si256(p + 1, xin23); - _mm256_store_si256(p + 2, xin45); - _mm256_store_si256(p + 3, xin67); + _mm256_storeu_si256(p + 0, xin01); + _mm256_storeu_si256(p + 1, xin23); + _mm256_storeu_si256(p + 2, xin45); + _mm256_storeu_si256(p + 3, xin67); } _mm256_zeroupper(); } -template -NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2) +NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1); __m256i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -257,7 +251,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig vaes_genkey_double(input1, input2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); { - const bool b = props.half_mem() && !ctx1->first_half && !ctx2->first_half; + const bool b = half_mem && !ctx1->first_half && !ctx2->first_half; const __m128i* p1 = b ? reinterpret_cast(ctx1->save_state) : (input1 + 4); const __m128i* p2 = b ? reinterpret_cast(ctx2->save_state) : (input2 + 4); xin0 = _mm256_loadu2_m128i(p2 + 0, p1 + 0); @@ -315,7 +309,7 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig prefetch_ptr2 = output2; } - if (props.half_mem() && ctx1->first_half && ctx2->first_half) { + if (half_mem && ctx1->first_half && ctx2->first_half) { __m128i* p1 = reinterpret_cast<__m128i*>(ctx1->save_state); __m128i* p2 = reinterpret_cast<__m128i*>(ctx2->save_state); _mm256_storeu2_m128i(p2 + 0, p1 + 0, xin0); @@ -332,12 +326,9 @@ NOINLINE void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig } -template -NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) +NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m256i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m256i)) / (half_mem ? 2 : 1); __m256i xout01, xout23, xout45, xout67; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -347,17 +338,17 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) vaes_genkey(reinterpret_cast<__m128i*>(output) + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); - xout01 = _mm256_load_si256(output + 2); - xout23 = _mm256_load_si256(output + 3); - xout45 = _mm256_load_si256(output + 4); - xout67 = _mm256_load_si256(output + 5); + xout01 = _mm256_loadu_si256(output + 2); + xout23 = _mm256_loadu_si256(output + 3); + xout45 = _mm256_loadu_si256(output + 4); + xout67 = _mm256_loadu_si256(output + 5); const __m256i* input_begin = input; - for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) { - if (props.half_mem() && (part == 1)) { + for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) { + if (half_mem && (part == 1)) { input = input_begin; ctx->first_half = false; - cn_explode_scratchpad_vaes(ctx); + cn_explode_scratchpad_vaes(ctx, memory, half_mem); } for (size_t i = 0; i < N;) { @@ -390,21 +381,18 @@ NOINLINE void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx) } } - _mm256_store_si256(output + 2, xout01); - _mm256_store_si256(output + 3, xout23); - _mm256_store_si256(output + 4, xout45); - _mm256_store_si256(output + 5, xout67); + _mm256_storeu_si256(output + 2, xout01); + _mm256_storeu_si256(output + 3, xout23); + _mm256_storeu_si256(output + 4, xout45); + _mm256_storeu_si256(output + 5, xout67); _mm256_zeroupper(); } -template -NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2) +NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem) { - constexpr CnAlgo props; - - constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1); + const size_t N = (memory / sizeof(__m128i)) / (half_mem ? 2 : 1); __m256i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; __m256i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; @@ -428,13 +416,13 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig const __m128i* input_begin1 = input1; const __m128i* input_begin2 = input2; - for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) { - if (props.half_mem() && (part == 1)) { + for (size_t part = 0; part < (half_mem ? 2 : 1); ++part) { + if (half_mem && (part == 1)) { input1 = input_begin1; input2 = input_begin2; ctx1->first_half = false; ctx2->first_half = false; - cn_explode_scratchpad_vaes_double(ctx1, ctx2); + cn_explode_scratchpad_vaes_double(ctx1, ctx2, memory, half_mem); } for (size_t i = 0; i < N;) { @@ -487,44 +475,4 @@ NOINLINE void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonig } -template -void VAES_Instance() -{ - cn_explode_scratchpad_vaes(nullptr); - cn_explode_scratchpad_vaes_double(nullptr, nullptr); - cn_implode_scratchpad_vaes(nullptr); - cn_implode_scratchpad_vaes_double(nullptr, nullptr); -} - - -void (*vaes_instances[])() = { - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, - VAES_Instance, -}; - - } // xmrig diff --git a/src/crypto/cn/CryptoNight_x86_vaes.h b/src/crypto/cn/CryptoNight_x86_vaes.h index 475780b8..1c824ecf 100644 --- a/src/crypto/cn/CryptoNight_x86_vaes.h +++ b/src/crypto/cn/CryptoNight_x86_vaes.h @@ -36,10 +36,10 @@ struct cryptonight_ctx; namespace xmrig { -template void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx); -template void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2); -template void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx); -template void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2); +void cn_explode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem); +void cn_explode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem); +void cn_implode_scratchpad_vaes(cryptonight_ctx* ctx, size_t memory, bool half_mem); +void cn_implode_scratchpad_vaes_double(cryptonight_ctx* ctx1, cryptonight_ctx* ctx2, size_t memory, bool half_mem); } // xmrig diff --git a/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc new file mode 100644 index 00000000..896c1a2c --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_double_main_loop.inc @@ -0,0 +1,132 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 32 + mov rdx, QWORD PTR [rcx] + mov r8, QWORD PTR [rcx+8] + mov r12d, 524288 + movaps XMMWORD PTR [rsp+16], xmm6 + mov rbx, QWORD PTR [rdx+32] + xor rbx, QWORD PTR [rdx] + mov rsi, QWORD PTR [rdx+40] + mov r10, rbx + xor rsi, QWORD PTR [rdx+8] + and r10d, 2097136 + mov rdi, QWORD PTR [r8+32] + xor rdi, QWORD PTR [r8] + movq xmm3, rbx + mov rbp, QWORD PTR [r8+40] + mov r9, rdi + xor rbp, QWORD PTR [r8+8] + movq xmm0, rsi + mov rcx, QWORD PTR [rdx+56] + and r9d, 2097136 + xor rcx, QWORD PTR [rdx+24] + movq xmm4, rdi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov r14, QWORD PTR [rdx+224] + mov r13, QWORD PTR [rdx+232] + mov r15, QWORD PTR [r8+224] + punpcklqdq xmm3, xmm0 + movq xmm0, rbp + movq xmm5, rax + punpcklqdq xmm4, xmm0 + mov rax, QWORD PTR [r8+48] + movq xmm0, rcx + xor rax, QWORD PTR [r8+16] + mov rcx, QWORD PTR [r8+56] + xor rcx, QWORD PTR [r8+24] + movdqu xmm1, XMMWORD PTR [r14+r10] + movq xmm6, rax + punpcklqdq xmm5, xmm0 + mov rax, QWORD PTR [rdx+240] + movq xmm0, rcx + movdqu xmm2, XMMWORD PTR [r15+r9] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r8+240] + mov QWORD PTR [rsp+8], rax + punpcklqdq xmm6, xmm0 + + ALIGN(64) +main_loop_cnv1_double: + aesenc xmm1, xmm3 + aesenc xmm2, xmm4 + movdqa xmm0, xmm1 + movq r11, xmm2 + pxor xmm0, xmm5 + movdqa xmm5, xmm1 + movq QWORD PTR [r14+r10], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + movdqa xmm0, xmm2 + shr rax, 24 + pxor xmm0, xmm6 + movdqa xmm6, xmm2 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + mov QWORD PTR [r14+r10+8], rax + movq QWORD PTR [r15+r9], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + shr rax, 24 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + movq rcx, xmm1 + mov QWORD PTR [r15+r9+8], rax + mov r9, rcx + and r9d, 2097136 + mov r10, QWORD PTR [r14+r9] + mov r8, QWORD PTR [r14+r9+8] + mov rax, r10 + mul rcx + add rsi, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [r14+r9], rbx + xor rax, rsi + mov QWORD PTR [r14+r9+8], rax + xor rsi, r8 + xor rbx, r10 + mov r8, r11 + and r8d, 2097136 + mov r10, rbx + and r10d, 2097136 + movq xmm3, rbx + pinsrq xmm3, rsi, 1 + mov r9, QWORD PTR [r15+r8] + mov rcx, QWORD PTR [r15+r8+8] + mov rax, r9 + movdqu xmm1, XMMWORD PTR [r14+r10] + mul r11 + add rbp, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r15+r8], rdi + xor rax, rbp + xor rdi, r9 + mov QWORD PTR [r15+r8+8], rax + mov r9, rdi + xor rbp, rcx + and r9d, 2097136 + movq xmm4, rdi + pinsrq xmm4, rbp, 1 + movdqu xmm2, XMMWORD PTR [r15+r9] + sub r12, 1 + jne main_loop_cnv1_double + + mov rbx, QWORD PTR [rsp+80] + mov rbp, QWORD PTR [rsp+88] + mov rsi, QWORD PTR [rsp+96] + movaps xmm6, XMMWORD PTR [rsp+16] + add rsp, 32 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc new file mode 100644 index 00000000..2b61ced0 --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_quad_main_loop.inc @@ -0,0 +1,263 @@ + mov rax, rsp + mov QWORD PTR [rax+8], rbx + mov QWORD PTR [rax+16], rbp + mov QWORD PTR [rax+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 144 + mov r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [rcx+16] + mov r11, QWORD PTR [rcx+24] + mov rbp, QWORD PTR [r8+224] + mov r13, QWORD PTR [r8+232] + mov r14, QWORD PTR [r9+224] + mov r15, QWORD PTR [r10+224] + mov r12, QWORD PTR [r11+224] + mov rcx, QWORD PTR [r8+40] + xor rcx, QWORD PTR [r8+8] + mov rbx, QWORD PTR [r8+32] + xor rbx, QWORD PTR [r8] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + movq xmm0, rcx + mov rcx, QWORD PTR [r9+40] + xor rcx, QWORD PTR [r9+8] + movq xmm1, rbx + movaps XMMWORD PTR [rax-56], xmm6 + movaps XMMWORD PTR [rax-72], xmm7 + movaps XMMWORD PTR [rax-88], xmm8 + movaps XMMWORD PTR [rax-104], xmm9 + movaps XMMWORD PTR [rax-120], xmm10 + movaps XMMWORD PTR [rsp+48], xmm11 + movaps XMMWORD PTR [rsp+32], xmm12 + and ebx, 2097136 + mov rsi, QWORD PTR [r10+32] + movq xmm2, rdi + mov rax, QWORD PTR [r8+240] + and edi, 2097136 + xor rsi, QWORD PTR [r10] + mov rdx, QWORD PTR [r8+56] + xor rdx, QWORD PTR [r8+24] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r9+240] + movq xmm3, rsi + mov QWORD PTR [rsp+8], rax + and esi, 2097136 + mov rax, QWORD PTR [r10+240] + punpcklqdq xmm1, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r10+40] + xor rcx, QWORD PTR [r10+8] + mov QWORD PTR [rsp+16], rax + mov rax, QWORD PTR [r11+240] + punpcklqdq xmm2, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+24], rax + mov rcx, QWORD PTR [r11+40] + xor rcx, QWORD PTR [r11+8] + mov rax, QWORD PTR [r11+32] + xor rax, QWORD PTR [r11] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r8+48] + xor rcx, QWORD PTR [r8+16] + movq xmm4, rax + and eax, 2097136 + punpcklqdq xmm4, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r9+56] + xor rdx, QWORD PTR [r9+24] + movq xmm5, rcx + mov rcx, QWORD PTR [r9+48] + xor rcx, QWORD PTR [r9+16] + punpcklqdq xmm5, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r10+56] + xor rdx, QWORD PTR [r10+24] + movq xmm6, rcx + mov rcx, QWORD PTR [r10+48] + xor rcx, QWORD PTR [r10+16] + punpcklqdq xmm6, xmm0 + movq xmm0, rdx + mov rdx, QWORD PTR [r11+56] + movq xmm7, rcx + punpcklqdq xmm7, xmm0 + xor rdx, QWORD PTR [r11+24] + mov rcx, QWORD PTR [r11+48] + xor rcx, QWORD PTR [r11+16] + mov r11d, 524288 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + movdqu xmm10, XMMWORD PTR [r14+rdi] + movq xmm0, rdx + movdqu xmm11, XMMWORD PTR [r15+rsi] + movdqu xmm12, XMMWORD PTR [r12+rax] + movq xmm8, rcx + punpcklqdq xmm8, xmm0 + + ALIGN(64) +main_loop_cnv1_quad: + aesenc xmm9, xmm1 + aesenc xmm10, xmm2 + aesenc xmm11, xmm3 + aesenc xmm12, xmm4 + movd ecx, xmm9 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+rbp] + movd ecx, xmm10 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r14] + movd ecx, xmm11 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r15] + movd ecx, xmm12 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r12] + movdqa xmm0, xmm9 + pxor xmm0, xmm5 + movdqa xmm5, xmm9 + movq QWORD PTR [rbp+rbx], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm10 + shr rcx, 24 + pxor xmm0, xmm6 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [rbp+rbx+8], rcx + movq rbx, xmm1 + movq QWORD PTR [r14+rdi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm11 + shr rcx, 24 + pxor xmm0, xmm7 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r14+rdi+8], rcx + movq rdi, xmm2 + movq QWORD PTR [r15+rsi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm12 + shr rcx, 24 + pxor xmm0, xmm8 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r15+rsi+8], rcx + movq QWORD PTR [r12+rax], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + shr rcx, 24 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r12+rax+8], rcx + movq rcx, xmm9 + mov r8, rcx + and r8d, 2097136 + mov r9, QWORD PTR [rbp+r8] + mov r10, QWORD PTR [rbp+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm1, 1 + add rcx, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [rbp+r8], rbx + xor rax, rcx + mov QWORD PTR [rbp+r8+8], rax + xor rcx, r10 + xor rbx, r9 + movq xmm1, rbx + and ebx, 2097136 + pinsrq xmm1, rcx, 1 + movq rcx, xmm10 + mov r8, rcx + and r8d, 2097136 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + mov r9, QWORD PTR [r14+r8] + mov r10, QWORD PTR [r14+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm2, 1 + add rcx, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r14+r8], rdi + xor rax, rcx + xor rdi, r9 + mov QWORD PTR [r14+r8+8], rax + xor rcx, r10 + movq xmm2, rdi + and edi, 2097136 + pinsrq xmm2, rcx, 1 + movq rcx, xmm11 + movq rsi, xmm3 + mov r8, rcx + and r8d, 2097136 + movdqa xmm6, xmm10 + movdqa xmm7, xmm11 + movdqa xmm8, xmm12 + movdqu xmm10, XMMWORD PTR [r14+rdi] + mov r9, QWORD PTR [r15+r8] + mov r10, QWORD PTR [r15+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm3, 1 + add rcx, rax + add rsi, rdx + mov rax, QWORD PTR [rsp+16] + xor rax, rcx + mov QWORD PTR [r15+r8], rsi + mov QWORD PTR [r15+r8+8], rax + xor rcx, r10 + xor rsi, r9 + movq xmm3, rsi + and esi, 2097136 + pinsrq xmm3, rcx, 1 + movq rcx, xmm12 + mov r8, rcx + and r8d, 2097136 + movdqu xmm11, XMMWORD PTR [r15+rsi] + mov r9, QWORD PTR [r12+r8] + mov r10, QWORD PTR [r12+r8+8] + mov rax, r9 + mul rcx + mov rcx, rax + movq rax, xmm4 + add rax, rdx + mov QWORD PTR [r12+r8], rax + xor rax, r9 + pextrq rdx, xmm4, 1 + add rdx, rcx + mov rcx, QWORD PTR [rsp+24] + xor rcx, rdx + xor rdx, r10 + movq xmm4, rax + mov QWORD PTR [r12+r8+8], rcx + and eax, 2097136 + pinsrq xmm4, rdx, 1 + movdqu xmm12, XMMWORD PTR [r12+rax] + sub r11, 1 + jne main_loop_cnv1_quad + + movaps xmm7, XMMWORD PTR [rsp+112] + lea r11, QWORD PTR [rsp+144] + mov rbx, QWORD PTR [r11+48] + mov rbp, QWORD PTR [r11+56] + mov rsi, QWORD PTR [r11+64] + movaps xmm6, XMMWORD PTR [r11-16] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm9, XMMWORD PTR [r11-64] + movaps xmm10, XMMWORD PTR [r11-80] + movaps xmm11, XMMWORD PTR [r11-96] + movaps xmm12, XMMWORD PTR [r11-112] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc b/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc new file mode 100644 index 00000000..62558c3c --- /dev/null +++ b/src/crypto/cn/asm/cn1/cnv1_single_main_loop.inc @@ -0,0 +1,66 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r13 + push r14 + push r15 + mov rdx, QWORD PTR [rcx] + mov esi, 524288 + mov r11, QWORD PTR [rdx+32] + xor r11, QWORD PTR [rdx] + mov rdi, QWORD PTR [rdx+224] + mov rbx, QWORD PTR [rdx+40] + xor rbx, QWORD PTR [rdx+8] + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov rbp, QWORD PTR [rdx+240] + mov r14, QWORD PTR [rdx+232] + movq xmm2, rax + pinsrq xmm2, rcx, 1 + + ALIGN(64) +main_loop_cnv1_single: + mov r8, r11 + and r8d, 2097136 + movdqu xmm1, XMMWORD PTR [rdi+r8] + movq xmm0, r11 + pinsrq xmm0, rbx, 1 + aesenc xmm1, xmm0 + movq r15, xmm1 + mov r9, r15 + and r9d, 2097136 + movdqa xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm2, xmm1 + movq QWORD PTR [rdi+r8], xmm0 + pextrq rdx, xmm0, 1 + mov eax, edx + shr rax, 24 + mov ecx, DWORD PTR [r14+rax*4] + xor rcx, rdx + mov QWORD PTR [rdi+r8+8], rcx + mov r10, QWORD PTR [rdi+r9] + mov r8, QWORD PTR [rdi+r9+8] + mov rax, r10 + mul r15 + add rbx, rax + add r11, rdx + mov QWORD PTR [rdi+r9], r11 + mov rax, rbx + xor rbx, r8 + xor r11, r10 + xor rax, rbp + mov QWORD PTR [rdi+r9+8], rax + sub rsi, 1 + jne main_loop_cnv1_single + + pop r15 + pop r14 + pop r13 + mov rbx, QWORD PTR [rsp+8] + mov rbp, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + mov rdi, QWORD PTR [rsp+32] diff --git a/src/crypto/cn/asm/cn_main_loop.S b/src/crypto/cn/asm/cn_main_loop.S index 0dfd3ee2..527e20e1 100644 --- a/src/crypto/cn/asm/cn_main_loop.S +++ b/src/crypto/cn/asm/cn_main_loop.S @@ -11,6 +11,9 @@ # define FN_PREFIX(fn) fn .section .text #endif +.global FN_PREFIX(cnv1_single_mainloop_asm) +.global FN_PREFIX(cnv1_double_mainloop_asm) +.global FN_PREFIX(cnv1_quad_mainloop_asm) .global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) @@ -19,6 +22,33 @@ .global FN_PREFIX(cnv2_rwz_double_mainloop_asm) .global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm) +ALIGN(64) +FN_PREFIX(cnv1_single_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_single_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv1_double_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_double_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + +ALIGN(64) +FN_PREFIX(cnv1_quad_mainloop_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn1/cnv1_quad_main_loop.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 diff --git a/src/crypto/cn/asm/cn_main_loop.asm b/src/crypto/cn/asm/cn_main_loop.asm index 97ae5299..0979580a 100644 --- a/src/crypto/cn/asm/cn_main_loop.asm +++ b/src/crypto/cn/asm/cn_main_loop.asm @@ -1,4 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_single_mainloop_asm +PUBLIC cnv1_double_mainloop_asm +PUBLIC cnv1_quad_mainloop_asm PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm @@ -6,6 +9,27 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_rwz_mainloop_asm PUBLIC cnv2_rwz_double_mainloop_asm +ALIGN(64) +cnv1_single_mainloop_asm PROC + INCLUDE cn1/cnv1_single_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_single_mainloop_asm ENDP + +ALIGN(64) +cnv1_double_mainloop_asm PROC + INCLUDE cn1/cnv1_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_double_mainloop_asm ENDP + +ALIGN(64) +cnv1_quad_mainloop_asm PROC + INCLUDE cn1/cnv1_quad_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_quad_mainloop_asm ENDP + ALIGN(64) cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc new file mode 100644 index 00000000..ac0cdb21 --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_double_main_loop.inc @@ -0,0 +1,132 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 32 + mov rdx, QWORD PTR [rcx] + mov r8, QWORD PTR [rcx+8] + mov r12d, 524288 + movaps XMMWORD PTR [rsp+16], xmm6 + mov rbx, QWORD PTR [rdx+32] + xor rbx, QWORD PTR [rdx] + mov rsi, QWORD PTR [rdx+40] + mov r10, rbx + xor rsi, QWORD PTR [rdx+8] + and r10d, 2097136 + mov rdi, QWORD PTR [r8+32] + xor rdi, QWORD PTR [r8] + movd xmm3, rbx + mov rbp, QWORD PTR [r8+40] + mov r9, rdi + xor rbp, QWORD PTR [r8+8] + movd xmm0, rsi + mov rcx, QWORD PTR [rdx+56] + and r9d, 2097136 + xor rcx, QWORD PTR [rdx+24] + movd xmm4, rdi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov r14, QWORD PTR [rdx+224] + mov r13, QWORD PTR [rdx+232] + mov r15, QWORD PTR [r8+224] + punpcklqdq xmm3, xmm0 + movd xmm0, rbp + movd xmm5, rax + punpcklqdq xmm4, xmm0 + mov rax, QWORD PTR [r8+48] + movd xmm0, rcx + xor rax, QWORD PTR [r8+16] + mov rcx, QWORD PTR [r8+56] + xor rcx, QWORD PTR [r8+24] + movdqu xmm1, XMMWORD PTR [r14+r10] + movd xmm6, rax + punpcklqdq xmm5, xmm0 + mov rax, QWORD PTR [rdx+240] + movd xmm0, rcx + movdqu xmm2, XMMWORD PTR [r15+r9] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r8+240] + mov QWORD PTR [rsp+8], rax + punpcklqdq xmm6, xmm0 + + ALIGN(64) +main_loop_cnv1_double: + aesenc xmm1, xmm3 + aesenc xmm2, xmm4 + movdqa xmm0, xmm1 + movd r11, xmm2 + pxor xmm0, xmm5 + movdqa xmm5, xmm1 + movd QWORD PTR [r14+r10], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + movdqa xmm0, xmm2 + shr rax, 24 + pxor xmm0, xmm6 + movdqa xmm6, xmm2 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + mov QWORD PTR [r14+r10+8], rax + movd QWORD PTR [r15+r9], xmm0 + pextrq rcx, xmm0, 1 + mov eax, ecx + shr rax, 24 + mov eax, DWORD PTR [r13+rax*4] + xor rax, rcx + movd rcx, xmm1 + mov QWORD PTR [r15+r9+8], rax + mov r9, rcx + and r9d, 2097136 + mov r10, QWORD PTR [r14+r9] + mov r8, QWORD PTR [r14+r9+8] + mov rax, r10 + mul rcx + add rsi, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [r14+r9], rbx + xor rax, rsi + mov QWORD PTR [r14+r9+8], rax + xor rsi, r8 + xor rbx, r10 + mov r8, r11 + and r8d, 2097136 + mov r10, rbx + and r10d, 2097136 + movd xmm3, rbx + pinsrq xmm3, rsi, 1 + mov r9, QWORD PTR [r15+r8] + mov rcx, QWORD PTR [r15+r8+8] + mov rax, r9 + movdqu xmm1, XMMWORD PTR [r14+r10] + mul r11 + add rbp, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r15+r8], rdi + xor rax, rbp + xor rdi, r9 + mov QWORD PTR [r15+r8+8], rax + mov r9, rdi + xor rbp, rcx + and r9d, 2097136 + movd xmm4, rdi + pinsrq xmm4, rbp, 1 + movdqu xmm2, XMMWORD PTR [r15+r9] + sub r12, 1 + jne main_loop_cnv1_double + + mov rbx, QWORD PTR [rsp+80] + mov rbp, QWORD PTR [rsp+88] + mov rsi, QWORD PTR [rsp+96] + movaps xmm6, XMMWORD PTR [rsp+16] + add rsp, 32 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc new file mode 100644 index 00000000..78d145a1 --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_quad_main_loop.inc @@ -0,0 +1,263 @@ + mov rax, rsp + mov QWORD PTR [rax+8], rbx + mov QWORD PTR [rax+16], rbp + mov QWORD PTR [rax+24], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 144 + mov r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+8] + mov r10, QWORD PTR [rcx+16] + mov r11, QWORD PTR [rcx+24] + mov rbp, QWORD PTR [r8+224] + mov r13, QWORD PTR [r8+232] + mov r14, QWORD PTR [r9+224] + mov r15, QWORD PTR [r10+224] + mov r12, QWORD PTR [r11+224] + mov rcx, QWORD PTR [r8+40] + xor rcx, QWORD PTR [r8+8] + mov rbx, QWORD PTR [r8+32] + xor rbx, QWORD PTR [r8] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + movd xmm0, rcx + mov rcx, QWORD PTR [r9+40] + xor rcx, QWORD PTR [r9+8] + movd xmm1, rbx + movaps XMMWORD PTR [rax-56], xmm6 + movaps XMMWORD PTR [rax-72], xmm7 + movaps XMMWORD PTR [rax-88], xmm8 + movaps XMMWORD PTR [rax-104], xmm9 + movaps XMMWORD PTR [rax-120], xmm10 + movaps XMMWORD PTR [rsp+48], xmm11 + movaps XMMWORD PTR [rsp+32], xmm12 + and ebx, 2097136 + mov rsi, QWORD PTR [r10+32] + movd xmm2, rdi + mov rax, QWORD PTR [r8+240] + and edi, 2097136 + xor rsi, QWORD PTR [r10] + mov rdx, QWORD PTR [r8+56] + xor rdx, QWORD PTR [r8+24] + mov QWORD PTR [rsp], rax + mov rax, QWORD PTR [r9+240] + movd xmm3, rsi + mov QWORD PTR [rsp+8], rax + and esi, 2097136 + mov rax, QWORD PTR [r10+240] + punpcklqdq xmm1, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r10+40] + xor rcx, QWORD PTR [r10+8] + mov QWORD PTR [rsp+16], rax + mov rax, QWORD PTR [r11+240] + punpcklqdq xmm2, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+24], rax + mov rcx, QWORD PTR [r11+40] + xor rcx, QWORD PTR [r11+8] + mov rax, QWORD PTR [r11+32] + xor rax, QWORD PTR [r11] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r8+48] + xor rcx, QWORD PTR [r8+16] + movd xmm4, rax + and eax, 2097136 + punpcklqdq xmm4, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r9+56] + xor rdx, QWORD PTR [r9+24] + movd xmm5, rcx + mov rcx, QWORD PTR [r9+48] + xor rcx, QWORD PTR [r9+16] + punpcklqdq xmm5, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r10+56] + xor rdx, QWORD PTR [r10+24] + movd xmm6, rcx + mov rcx, QWORD PTR [r10+48] + xor rcx, QWORD PTR [r10+16] + punpcklqdq xmm6, xmm0 + movd xmm0, rdx + mov rdx, QWORD PTR [r11+56] + movd xmm7, rcx + punpcklqdq xmm7, xmm0 + xor rdx, QWORD PTR [r11+24] + mov rcx, QWORD PTR [r11+48] + xor rcx, QWORD PTR [r11+16] + mov r11d, 524288 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + movdqu xmm10, XMMWORD PTR [r14+rdi] + movd xmm0, rdx + movdqu xmm11, XMMWORD PTR [r15+rsi] + movdqu xmm12, XMMWORD PTR [r12+rax] + movd xmm8, rcx + punpcklqdq xmm8, xmm0 + + ALIGN(64) +main_loop_cnv1_quad: + aesenc xmm9, xmm1 + aesenc xmm10, xmm2 + aesenc xmm11, xmm3 + aesenc xmm12, xmm4 + movd ecx, xmm9 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+rbp] + movd ecx, xmm10 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r14] + movd ecx, xmm11 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r15] + movd ecx, xmm12 + and ecx, 2097136 + prefetcht0 BYTE PTR [rcx+r12] + movdqa xmm0, xmm9 + pxor xmm0, xmm5 + movdqa xmm5, xmm9 + movd QWORD PTR [rbp+rbx], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm10 + shr rcx, 24 + pxor xmm0, xmm6 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [rbp+rbx+8], rcx + movd rbx, xmm1 + movd QWORD PTR [r14+rdi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm11 + shr rcx, 24 + pxor xmm0, xmm7 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r14+rdi+8], rcx + movd rdi, xmm2 + movd QWORD PTR [r15+rsi], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + movdqa xmm0, xmm12 + shr rcx, 24 + pxor xmm0, xmm8 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r15+rsi+8], rcx + movd QWORD PTR [r12+rax], xmm0 + pextrq rdx, xmm0, 1 + mov ecx, edx + shr rcx, 24 + mov ecx, DWORD PTR [r13+rcx*4] + xor rcx, rdx + mov QWORD PTR [r12+rax+8], rcx + movd rcx, xmm9 + mov r8, rcx + and r8d, 2097136 + mov r9, QWORD PTR [rbp+r8] + mov r10, QWORD PTR [rbp+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm1, 1 + add rcx, rax + add rbx, rdx + mov rax, QWORD PTR [rsp] + mov QWORD PTR [rbp+r8], rbx + xor rax, rcx + mov QWORD PTR [rbp+r8+8], rax + xor rcx, r10 + xor rbx, r9 + movd xmm1, rbx + and ebx, 2097136 + pinsrq xmm1, rcx, 1 + movd rcx, xmm10 + mov r8, rcx + and r8d, 2097136 + movdqu xmm9, XMMWORD PTR [rbp+rbx] + mov r9, QWORD PTR [r14+r8] + mov r10, QWORD PTR [r14+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm2, 1 + add rcx, rax + add rdi, rdx + mov rax, QWORD PTR [rsp+8] + mov QWORD PTR [r14+r8], rdi + xor rax, rcx + xor rdi, r9 + mov QWORD PTR [r14+r8+8], rax + xor rcx, r10 + movd xmm2, rdi + and edi, 2097136 + pinsrq xmm2, rcx, 1 + movd rcx, xmm11 + movd rsi, xmm3 + mov r8, rcx + and r8d, 2097136 + movdqa xmm6, xmm10 + movdqa xmm7, xmm11 + movdqa xmm8, xmm12 + movdqu xmm10, XMMWORD PTR [r14+rdi] + mov r9, QWORD PTR [r15+r8] + mov r10, QWORD PTR [r15+r8+8] + mov rax, r9 + mul rcx + pextrq rcx, xmm3, 1 + add rcx, rax + add rsi, rdx + mov rax, QWORD PTR [rsp+16] + xor rax, rcx + mov QWORD PTR [r15+r8], rsi + mov QWORD PTR [r15+r8+8], rax + xor rcx, r10 + xor rsi, r9 + movd xmm3, rsi + and esi, 2097136 + pinsrq xmm3, rcx, 1 + movd rcx, xmm12 + mov r8, rcx + and r8d, 2097136 + movdqu xmm11, XMMWORD PTR [r15+rsi] + mov r9, QWORD PTR [r12+r8] + mov r10, QWORD PTR [r12+r8+8] + mov rax, r9 + mul rcx + mov rcx, rax + movd rax, xmm4 + add rax, rdx + mov QWORD PTR [r12+r8], rax + xor rax, r9 + pextrq rdx, xmm4, 1 + add rdx, rcx + mov rcx, QWORD PTR [rsp+24] + xor rcx, rdx + xor rdx, r10 + movd xmm4, rax + mov QWORD PTR [r12+r8+8], rcx + and eax, 2097136 + pinsrq xmm4, rdx, 1 + movdqu xmm12, XMMWORD PTR [r12+rax] + sub r11, 1 + jne main_loop_cnv1_quad + + movaps xmm7, XMMWORD PTR [rsp+112] + lea r11, QWORD PTR [rsp+144] + mov rbx, QWORD PTR [r11+48] + mov rbp, QWORD PTR [r11+56] + mov rsi, QWORD PTR [r11+64] + movaps xmm6, XMMWORD PTR [r11-16] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm9, XMMWORD PTR [r11-64] + movaps xmm10, XMMWORD PTR [r11-80] + movaps xmm11, XMMWORD PTR [r11-96] + movaps xmm12, XMMWORD PTR [r11-112] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi diff --git a/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc b/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc new file mode 100644 index 00000000..37413f23 --- /dev/null +++ b/src/crypto/cn/asm/win64/cn1/cnv1_single_main_loop.inc @@ -0,0 +1,66 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r13 + push r14 + push r15 + mov rdx, QWORD PTR [rcx] + mov esi, 524288 + mov r11, QWORD PTR [rdx+32] + xor r11, QWORD PTR [rdx] + mov rdi, QWORD PTR [rdx+224] + mov rbx, QWORD PTR [rdx+40] + xor rbx, QWORD PTR [rdx+8] + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + mov rbp, QWORD PTR [rdx+240] + mov r14, QWORD PTR [rdx+232] + movd xmm2, rax + pinsrq xmm2, rcx, 1 + + ALIGN(64) +main_loop_cnv1_single: + mov r8, r11 + and r8d, 2097136 + movdqu xmm1, XMMWORD PTR [rdi+r8] + movd xmm0, r11 + pinsrq xmm0, rbx, 1 + aesenc xmm1, xmm0 + movd r15, xmm1 + mov r9, r15 + and r9d, 2097136 + movdqa xmm0, xmm1 + pxor xmm0, xmm2 + movdqa xmm2, xmm1 + movd QWORD PTR [rdi+r8], xmm0 + pextrq rdx, xmm0, 1 + mov eax, edx + shr rax, 24 + mov ecx, DWORD PTR [r14+rax*4] + xor rcx, rdx + mov QWORD PTR [rdi+r8+8], rcx + mov r10, QWORD PTR [rdi+r9] + mov r8, QWORD PTR [rdi+r9+8] + mov rax, r10 + mul r15 + add rbx, rax + add r11, rdx + mov QWORD PTR [rdi+r9], r11 + mov rax, rbx + xor rbx, r8 + xor r11, r10 + xor rax, rbp + mov QWORD PTR [rdi+r9+8], rax + sub rsi, 1 + jne main_loop_cnv1_single + + pop r15 + pop r14 + pop r13 + mov rbx, QWORD PTR [rsp+8] + mov rbp, QWORD PTR [rsp+16] + mov rsi, QWORD PTR [rsp+24] + mov rdi, QWORD PTR [rsp+32] diff --git a/src/crypto/cn/asm/win64/cn_main_loop.S b/src/crypto/cn/asm/win64/cn_main_loop.S index 9361469a..9a227f0a 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.S +++ b/src/crypto/cn/asm/win64/cn_main_loop.S @@ -1,6 +1,9 @@ #define ALIGN(x) .align 64 .intel_syntax noprefix .section .text +.global cnv1_single_mainloop_asm +.global cnv1_double_mainloop_asm +.global cnv1_quad_mainloop_asm .global cnv2_mainloop_ivybridge_asm .global cnv2_mainloop_ryzen_asm .global cnv2_mainloop_bulldozer_asm @@ -9,6 +12,24 @@ .global cnv2_rwz_double_mainloop_asm .global cnv2_upx_double_mainloop_zen3_asm +ALIGN(64) +cnv1_single_mainloop_asm: + #include "../cn1/cnv1_single_main_loop.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv1_double_mainloop_asm: + #include "../cn1/cnv1_double_main_loop.inc" + ret 0 + mov eax, 3735929054 + +ALIGN(64) +cnv1_quad_mainloop_asm: + #include "../cn1/cnv1_quad_main_loop.inc" + ret 0 + mov eax, 3735929054 + ALIGN(64) cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" diff --git a/src/crypto/cn/asm/win64/cn_main_loop.asm b/src/crypto/cn/asm/win64/cn_main_loop.asm index 7f83e682..0979580a 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.asm +++ b/src/crypto/cn/asm/win64/cn_main_loop.asm @@ -1,4 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_single_mainloop_asm +PUBLIC cnv1_double_mainloop_asm +PUBLIC cnv1_quad_mainloop_asm PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm @@ -6,28 +9,49 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_rwz_mainloop_asm PUBLIC cnv2_rwz_double_mainloop_asm -ALIGN 64 +ALIGN(64) +cnv1_single_mainloop_asm PROC + INCLUDE cn1/cnv1_single_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_single_mainloop_asm ENDP + +ALIGN(64) +cnv1_double_mainloop_asm PROC + INCLUDE cn1/cnv1_double_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_double_mainloop_asm ENDP + +ALIGN(64) +cnv1_quad_mainloop_asm PROC + INCLUDE cn1/cnv1_quad_main_loop.inc + ret 0 + mov eax, 3735929054 +cnv1_quad_mainloop_asm ENDP + +ALIGN(64) cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 mov eax, 3735929054 cnv2_mainloop_ivybridge_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 mov eax, 3735929054 cnv2_mainloop_ryzen_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 mov eax, 3735929054 cnv2_mainloop_bulldozer_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 diff --git a/src/crypto/ghostrider/CMakeLists.txt b/src/crypto/ghostrider/CMakeLists.txt index f76fe649..a80396b8 100644 --- a/src/crypto/ghostrider/CMakeLists.txt +++ b/src/crypto/ghostrider/CMakeLists.txt @@ -42,14 +42,40 @@ set(SOURCES ghostrider.cpp ) -if (CMAKE_C_COMPILER_ID MATCHES GNU) - # gcc 11.2.0 crashes with -ftree-vrp - set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Ofast -fno-tree-vrp") - - # gcc 11.2.0 creates incorrect code with -O3 - set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-O2") - - set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Ofast -Wno-unused-const-variable") +if (CMAKE_C_COMPILER_ID MATCHES MSVC) + set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") + set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "/O1 /Oi /Os") +elseif (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + set_source_files_properties(sph_blake.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_bmw.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_cubehash.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_echo.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_fugue.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_groestl.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_hamsi.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_jh.c PROPERTIES COMPILE_FLAGS "-Os -fno-tree-vrp") + set_source_files_properties(sph_keccak.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_luffa.c PROPERTIES COMPILE_FLAGS "-Os -Wno-unused-const-variable") + set_source_files_properties(sph_shabal.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_shavite.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_simd.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_sha2.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_skein.c PROPERTIES COMPILE_FLAGS "-Os") + set_source_files_properties(sph_whirlpool.c PROPERTIES COMPILE_FLAGS "-Os") endif() include_directories(.) diff --git a/src/crypto/ghostrider/README.md b/src/crypto/ghostrider/README.md index e91fc5bd..b1614abd 100644 --- a/src/crypto/ghostrider/README.md +++ b/src/crypto/ghostrider/README.md @@ -4,8 +4,6 @@ No tuning is required - auto-config works well on most CPUs! -**Note for Windows users: MSVC binary is ~5% faster than GCC binary!** - ### Sample command line (non-SSL port) ``` xmrig -a gr -o raptoreumemporium.com:3008 -u WALLET_ADDRESS @@ -26,16 +24,16 @@ While individual algorithm implementations are a bit unoptimized, XMRig achieves For the same reason, XMRig can sometimes use less than 100% CPU on Ryzen 3000/5000 CPUs if it finds that running 1 thread per core is faster for some Cryptonight variants on your system. -**Windows** (detailed results [here](https://imgur.com/a/uRU1yO2)) -CPU|cpuminer-gr-avx2 (tuned), h/s|XMRig (MSVC build), h/s|Speedup +**Windows** (detailed results [here](https://imgur.com/a/0njIVVW)) +CPU|cpuminer-gr-avx2 1.2.4.1 (tuned), h/s|XMRig v6.16.2 (MSVC build), h/s|Speedup -|-|-|- -AMD Ryzen 7 4700U|632.6|731|+15.5% -Intel Core i7-2600|496.4|533.6|+7.5% -AMD Ryzen 7 3700X @ 4.1 GHz|2453.0|2469.1|+0.65% -AMD Ryzen 5 5600X @ 4.65 GHz|2112.6|2313.2|+9.5% +AMD Ryzen 7 4700U|632.6|733.1|+15.89% +Intel Core i7-2600|496.4|554.6|+11.72% +AMD Ryzen 7 3700X @ 4.1 GHz|2453.0|2496.5|+1.77% +AMD Ryzen 5 5600X @ 4.65 GHz|2112.6|2337.5|+10.65% -**Linux** (tested by **Delgon**, detailed results [here](https://cdn.discordapp.com/attachments/604375870236524574/913167614749048872/unknown.png)) -CPU|cpuminer-gr-avx2 (tuned), h/s|XMRig (GCC build), h/s|Speedup +**Linux (outdated)** (tested by **Delgon**, detailed results [here](https://cdn.discordapp.com/attachments/604375870236524574/913167614749048872/unknown.png)) +CPU|cpuminer-gr-avx2 1.2.4.1 (tuned), h/s|XMRig v6.16.0 (GCC build), h/s|Speedup -|-|-|- AMD Ryzen 9 3900X|3746.51|3604.89|-3.78% 2xIntel Xeon E5-2698v3|2563.4|2638.38|+2.925% diff --git a/src/crypto/ghostrider/ghostrider.cpp b/src/crypto/ghostrider/ghostrider.cpp index a23150fc..9f403d3a 100644 --- a/src/crypto/ghostrider/ghostrider.cpp +++ b/src/crypto/ghostrider/ghostrider.cpp @@ -36,6 +36,7 @@ #include "base/io/log/Log.h" #include "base/io/log/Tags.h" +#include "base/tools/Chrono.h" #include "backend/cpu/Cpu.h" #include "crypto/cn/CnHash.h" #include "crypto/cn/CnCtx.h" @@ -44,7 +45,6 @@ #include #include -#include #include #ifdef XMRIG_FEATURE_HWLOC @@ -328,8 +328,6 @@ void benchmark() LOG_VERBOSE("%24s | N | Hashrate", "Algorithm"); LOG_VERBOSE("-------------------------|-----|-------------"); - using namespace std::chrono; - for (uint32_t algo = 0; algo < 6; ++algo) { for (uint64_t step : { 1, 2, 4}) { const size_t cur_scratchpad_size = cn_sizes[algo] * step; @@ -339,26 +337,26 @@ void benchmark() auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO); - const high_resolution_clock::time_point start_time = high_resolution_clock::now(); + double start_time = Chrono::highResolutionMSecs(); double min_dt = 1e10; for (uint32_t iter = 0;; ++iter) { - const high_resolution_clock::time_point t1 = high_resolution_clock::now(); + double t1 = Chrono::highResolutionMSecs(); // Stop after 15 milliseconds, but only if at least 10 iterations were done - if ((iter >= 10) && (duration_cast(t1 - start_time).count() >= 15)) { + if ((iter >= 10) && (t1 - start_time >= 15.0)) { break; } f(buf, sizeof(buf), hash, ctx, 0); - const double dt = duration_cast(high_resolution_clock::now() - t1).count() / 1e9; + const double dt = Chrono::highResolutionMSecs() - t1; if (dt < min_dt) { min_dt = dt; } } - const double hashrate = step / min_dt; + const double hashrate = step * 1e3 / min_dt; LOG_VERBOSE("%24s | %" PRIu64 "x1 | %.2f h/s", cn_names[algo], step, hashrate); if (hashrate > tune8MB[algo].hashrate) { @@ -388,14 +386,14 @@ void benchmark() auto f = CnHash::fn(cn_hash[algo], av[step], Assembly::AUTO); - const high_resolution_clock::time_point start_time = high_resolution_clock::now(); + double start_time = Chrono::highResolutionMSecs(); double min_dt = 1e10; for (uint32_t iter = 0;; ++iter) { - const high_resolution_clock::time_point t1 = high_resolution_clock::now(); + double t1 = Chrono::highResolutionMSecs(); // Stop after 30 milliseconds, but only if at least 10 iterations were done - if ((iter >= 10) && (duration_cast(t1 - start_time).count() >= 30)) { + if ((iter >= 10) && (t1 - start_time >= 30.0)) { break; } @@ -403,13 +401,13 @@ void benchmark() f(buf, sizeof(buf), hash, ctx, 0); helper->wait(); - const double dt = duration_cast(high_resolution_clock::now() - t1).count() / 1e9; + const double dt = Chrono::highResolutionMSecs() - t1; if (dt < min_dt) { min_dt = dt; } } - const double hashrate = step * 2.0 / min_dt * 1.0075; + const double hashrate = step * 2e3 / min_dt * 1.0075; LOG_VERBOSE("%24s | %" PRIu64 "x2 | %.2f h/s", cn_names[algo], step, hashrate); if (hashrate > tune8MB[algo].hashrate) { diff --git a/src/crypto/randomx/aes_hash.cpp b/src/crypto/randomx/aes_hash.cpp index 2383dab4..ee5989e1 100644 --- a/src/crypto/randomx/aes_hash.cpp +++ b/src/crypto/randomx/aes_hash.cpp @@ -382,7 +382,7 @@ void SelectSoftAESImpl(size_t threadsCount) double fast_speed = 0.0; for (size_t run = 0; run < 3; ++run) { for (size_t i = 0; i < impl.size(); ++i) { - const uint64_t t1 = xmrig::Chrono::highResolutionMSecs(); + const double t1 = xmrig::Chrono::highResolutionMSecs(); std::vector count(threadsCount, 0); std::vector threads; for (size_t t = 0; t < threadsCount; ++t) { @@ -401,7 +401,7 @@ void SelectSoftAESImpl(size_t threadsCount) threads[t].join(); total += count[t]; } - const uint64_t t2 = xmrig::Chrono::highResolutionMSecs(); + const double t2 = xmrig::Chrono::highResolutionMSecs(); const double speed = total * 1e3 / (t2 - t1); if (speed > fast_speed) { fast_idx = i; diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp index b7a3ecaf..ed520aa8 100644 --- a/src/crypto/randomx/randomx.cpp +++ b/src/crypto/randomx/randomx.cpp @@ -295,7 +295,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx INST_HANDLE(IMUL_R, ISUB_M); INST_HANDLE(IMUL_M, IMUL_R); -#if defined(_M_X64) || defined(__x86_64__) +#if defined(XMRIG_FEATURE_ASM) && (defined(_M_X64) || defined(__x86_64__)) if (hasBMI2) { INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M); INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R); @@ -337,7 +337,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx INST_HANDLE(CBRANCH, FSQRT_R); #endif -#if defined(_M_X64) || defined(__x86_64__) +#if defined(XMRIG_FEATURE_ASM) && (defined(_M_X64) || defined(__x86_64__)) if (hasBMI2) { INST_HANDLE2(CFROUND, CFROUND_BMI2, CBRANCH); } diff --git a/src/version.h b/src/version.h index 3548516d..46a477d8 100644 --- a/src/version.h +++ b/src/version.h @@ -22,7 +22,7 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig miner" -#define APP_VERSION "6.16.1" +#define APP_VERSION "6.16.2-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2021 xmrig.com" @@ -30,7 +30,7 @@ #define APP_VER_MAJOR 6 #define APP_VER_MINOR 16 -#define APP_VER_PATCH 1 +#define APP_VER_PATCH 2 #ifdef _MSC_VER # if (_MSC_VER >= 1920)