diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2fb6dc50..8827e7e8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,13 @@
+# v6.18.1
+- [#3129](https://github.com/xmrig/xmrig/pull/3129) Fix: protectRX flushed CPU cache only on macOS/iOS.
+- [#3126](https://github.com/xmrig/xmrig/pull/3126) Don't reset when pool sends the same job blob.
+- [#3120](https://github.com/xmrig/xmrig/pull/3120) RandomX: optimized `CFROUND` elimination.
+- [#3109](https://github.com/xmrig/xmrig/pull/3109) RandomX: added Blake2 AVX2 version.
+- [#3082](https://github.com/xmrig/xmrig/pull/3082) Fixed GCC 12 warnings.
+- [#3075](https://github.com/xmrig/xmrig/pull/3075) Recognize `armv7ve` as valid ARMv7 target.
+- [#3132](https://github.com/xmrig/xmrig/pull/3132) RandomX: added MSR mod for Zen 4.
+- [#3134](https://github.com/xmrig/xmrig/pull/3134) Added Zen4 to `randomx_boost.sh`.
+
 # v6.18.0
 - [#3067](https://github.com/xmrig/xmrig/pull/3067) Monero v15 network upgrade support and more housekeeping.
 - Removed deprecated AstroBWTv1 and v2.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d1bc2f0a..52d6d189 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ option(WITH_INTERLEAVE_DEBUG_LOG "Enable debug log for threads interleave" OFF)
 option(WITH_MO_BENCHMARK "Enable Benchmark module and algo-perf feature (for MoneroOcean)" ON)
 option(WITH_PROFILING "Enable profiling for developers" OFF)
 option(WITH_SSE4_1 "Enable SSE 4.1 for Blake2" ON)
+option(WITH_AVX2 "Enable AVX2 for Blake2" ON)
 option(WITH_VAES "Enable VAES instructions for Cryptonight" ON)
 option(WITH_BENCHMARK "Enable builtin RandomX benchmark and stress test" ON)
 option(WITH_SECURE_JIT "Enable secure access to JIT memory" OFF)
diff --git a/README.md b/README.md
index f4d73978..9d25e34e 100644
--- a/README.md
+++ b/README.md
@@ -7,7 +7,7 @@
 [![GitHub stars](https://img.shields.io/github/stars/xmrig/xmrig.svg)](https://github.com/MoneroOcean/xmrig/stargazers)
 [![GitHub forks](https://img.shields.io/github/forks/xmrig/xmrig.svg)](https://github.com/MoneroOcean/xmrig/network)
 
-XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight, AstroBWT and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD.
+XMRig is a high performance, open source, cross platform RandomX, KawPow, CryptoNight and [GhostRider](https://github.com/xmrig/xmrig/tree/master/src/crypto/ghostrider#readme) unified CPU/GPU miner and [RandomX benchmark](https://xmrig.com/benchmark). Official binaries are available for Windows, Linux, macOS and FreeBSD.
## Mining backends - **CPU** (x64/ARMv7/ARMv8) diff --git a/cmake/cpu.cmake b/cmake/cpu.cmake index 4a661b23..cbcd8b01 100644 --- a/cmake/cpu.cmake +++ b/cmake/cpu.cmake @@ -25,13 +25,14 @@ if (XMRIG_64_BIT AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64)$") add_definitions(-DRAPIDJSON_SSE2) else() set(WITH_SSE4_1 OFF) + set(WITH_AVX2 OFF) set(WITH_VAES OFF) endif() if (NOT ARM_TARGET) if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv8-a)$") set(ARM_TARGET 8) - elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l)$") + elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(armv7|armv7f|armv7s|armv7k|armv7-a|armv7l|armv7ve)$") set(ARM_TARGET 7) endif() endif() @@ -57,3 +58,7 @@ endif() if (WITH_SSE4_1) add_definitions(-DXMRIG_FEATURE_SSE4_1) endif() + +if (WITH_AVX2) + add_definitions(-DXMRIG_FEATURE_AVX2) +endif() diff --git a/cmake/randomx.cmake b/cmake/randomx.cmake index a702a868..69218f0f 100644 --- a/cmake/randomx.cmake +++ b/cmake/randomx.cmake @@ -83,7 +83,15 @@ if (WITH_RANDOMX) list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/blake2b_sse41.c) if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) - set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS -msse4.1) + set_source_files_properties(src/crypto/randomx/blake2/blake2b_sse41.c PROPERTIES COMPILE_FLAGS "-Ofast -msse4.1") + endif() + endif() + + if (WITH_AVX2) + list(APPEND SOURCES_CRYPTO src/crypto/randomx/blake2/avx2/blake2b_avx2.c) + + if (CMAKE_C_COMPILER_ID MATCHES GNU OR CMAKE_C_COMPILER_ID MATCHES Clang) + set_source_files_properties(src/crypto/randomx/blake2/avx2/blake2b_avx2.c PROPERTIES COMPILE_FLAGS "-Ofast -mavx2") endif() endif() diff --git a/scripts/randomx_boost.sh b/scripts/randomx_boost.sh index 3b60959d..f9f3ac2e 100755 --- a/scripts/randomx_boost.sh +++ b/scripts/randomx_boost.sh @@ -10,14 +10,24 @@ fi if grep -E 'AMD Ryzen|AMD EPYC' /proc/cpuinfo > /dev/null; then - if grep "cpu family[[:space:]]:[[:space:]]25" /proc/cpuinfo > /dev/null; + if grep "cpu family[[:space:]]\{1,\}:[[:space:]]25" /proc/cpuinfo > /dev/null; then - echo "Detected Zen3 CPU" - wrmsr -a 0xc0011020 0x4480000000000 - wrmsr -a 0xc0011021 0x1c000200000040 - wrmsr -a 0xc0011022 0xc000000401500000 - wrmsr -a 0xc001102b 0x2000cc14 - echo "MSR register values for Zen3 applied" + if grep "model[[:space:]]\{1,\}:[[:space:]]97" /proc/cpuinfo > /dev/null; + then + echo "Detected Zen4 CPU" + wrmsr -a 0xc0011020 0x4400000000000 + wrmsr -a 0xc0011021 0x4000000000040 + wrmsr -a 0xc0011022 0x8680000401570000 + wrmsr -a 0xc001102b 0x2040cc10 + echo "MSR register values for Zen4 applied" + else + echo "Detected Zen3 CPU" + wrmsr -a 0xc0011020 0x4480000000000 + wrmsr -a 0xc0011021 0x1c000200000040 + wrmsr -a 0xc0011022 0xc000000401500000 + wrmsr -a 0xc001102b 0x2000cc14 + echo "MSR register values for Zen3 applied" + fi else echo "Detected Zen1/Zen2 CPU" wrmsr -a 0xc0011020 0 diff --git a/src/backend/cpu/CpuWorker.cpp b/src/backend/cpu/CpuWorker.cpp index 2214a009..4f75ae8c 100644 --- a/src/backend/cpu/CpuWorker.cpp +++ b/src/backend/cpu/CpuWorker.cpp @@ -77,8 +77,11 @@ xmrig::CpuWorker::CpuWorker(size_t id, const CpuLaunchData &data) : { # ifdef XMRIG_ALGO_CN_HEAVY // cn-heavy optimization for Zen3 CPUs - const bool is_vermeer = (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) && (Cpu::info()->model() == 0x21); - if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (m_assembly != Assembly::NONE) && is_vermeer) { + const auto 
arch = Cpu::info()->arch();
+    const uint32_t model = Cpu::info()->model();
+    const bool is_vermeer = (arch == ICpuInfo::ARCH_ZEN3) && (model == 0x21);
+    const bool is_raphael = (arch == ICpuInfo::ARCH_ZEN4) && (model == 0x61);
+    if ((N == 1) && (m_av == CnHash::AV_SINGLE) && (m_algorithm.family() == Algorithm::CN_HEAVY) && (m_assembly != Assembly::NONE) && (is_vermeer || is_raphael)) {
         std::lock_guard<std::mutex> lock(cn_heavyZen3MemoryMutex);
         if (!cn_heavyZen3Memory) {
             // Round up number of threads to the multiple of 8
diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h
index f2b56009..387f319b 100644
--- a/src/backend/cpu/interfaces/ICpuInfo.h
+++ b/src/backend/cpu/interfaces/ICpuInfo.h
@@ -45,19 +45,21 @@ public:
         ARCH_ZEN,
         ARCH_ZEN_PLUS,
         ARCH_ZEN2,
-        ARCH_ZEN3
+        ARCH_ZEN3,
+        ARCH_ZEN4
     };
 
     enum MsrMod : uint32_t {
         MSR_MOD_NONE,
         MSR_MOD_RYZEN_17H,
         MSR_MOD_RYZEN_19H,
+        MSR_MOD_RYZEN_19H_ZEN4,
         MSR_MOD_INTEL,
         MSR_MOD_CUSTOM,
         MSR_MOD_MAX
     };
 
-#   define MSR_NAMES_LIST "none", "ryzen_17h", "ryzen_19h", "intel", "custom"
+#   define MSR_NAMES_LIST "none", "ryzen_17h", "ryzen_19h", "ryzen_19h_zen4", "intel", "custom"
 
     enum Flag : uint32_t {
         FLAG_AES,
diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp
index a565c6dc..1e377f7f 100644
--- a/src/backend/cpu/platform/BasicCpuInfo.cpp
+++ b/src/backend/cpu/platform/BasicCpuInfo.cpp
@@ -64,7 +64,7 @@ static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX m
 
 #ifdef XMRIG_FEATURE_MSR
-constexpr size_t kMsrArraySize = 5;
+constexpr size_t kMsrArraySize = 6;
 
 static const std::array<const char *, kMsrArraySize> msrNames = { MSR_NAMES_LIST };
 static_assert(kMsrArraySize == ICpuInfo::MSR_MOD_MAX, "kMsrArraySize and MSR_MOD_MAX mismatch");
 #endif
@@ -250,8 +250,14 @@ xmrig::BasicCpuInfo::BasicCpuInfo() :
             break;
 
         case 0x19:
-            m_arch = ARCH_ZEN3;
-            m_msrMod = MSR_MOD_RYZEN_19H;
+            if (m_model == 0x61) {
+                m_arch = ARCH_ZEN4;
+                m_msrMod = MSR_MOD_RYZEN_19H_ZEN4;
+            }
+            else {
+                m_arch = ARCH_ZEN3;
+                m_msrMod = MSR_MOD_RYZEN_19H;
+            }
             break;
 
         default:
diff --git a/src/base/net/stratum/DaemonClient.cpp b/src/base/net/stratum/DaemonClient.cpp
index 821045e9..f383308c 100644
--- a/src/base/net/stratum/DaemonClient.cpp
+++ b/src/base/net/stratum/DaemonClient.cpp
@@ -66,7 +66,6 @@ Storage<DaemonClient> DaemonClient::m_storage;
 static const char* kBlocktemplateBlob = "blocktemplate_blob";
 static const char* kBlockhashingBlob  = "blockhashing_blob";
-static const char* kLastError         = "lasterror";
 static const char *kGetHeight         = "/getheight";
 static const char *kGetInfo           = "/getinfo";
 static const char *kHash              = "hash";
diff --git a/src/base/net/stratum/Job.cpp b/src/base/net/stratum/Job.cpp
index 7f5d8d3f..56f5de80 100644
--- a/src/base/net/stratum/Job.cpp
+++ b/src/base/net/stratum/Job.cpp
@@ -48,7 +48,13 @@ xmrig::Job::Job(bool nicehash, const Algorithm &algorithm, const String &clientI
 bool xmrig::Job::isEqual(const Job &other) const
 {
-    return m_id == other.m_id && m_clientId == other.m_clientId && memcmp(m_blob, other.m_blob, sizeof(m_blob)) == 0 && m_target == other.m_target;
+    return m_id == other.m_id && m_clientId == other.m_clientId && isEqualBlob(other) && m_target == other.m_target;
+}
+
+
+bool xmrig::Job::isEqualBlob(const Job &other) const
+{
+    return (m_size == other.m_size) && (memcmp(m_blob, other.m_blob, m_size) == 0);
 }
 
 
@@ -58,19 +64,19 @@ bool xmrig::Job::setBlob(const char *blob)
         return false;
     }
 
-    m_size = strlen(blob);
-    if (m_size % 2 != 0) {
+    size_t size = strlen(blob);
+    if (size % 2 != 0) {
         return false;
     }
- m_size /= 2; + size /= 2; const size_t minSize = nonceOffset() + nonceSize(); - if (m_size < minSize || m_size >= sizeof(m_blob)) { + if (size < minSize || size >= sizeof(m_blob)) { return false; } - if (!Cvt::fromHex(m_blob, sizeof(m_blob), blob, m_size * 2)) { + if (!Cvt::fromHex(m_blob, sizeof(m_blob), blob, size * 2)) { return false; } @@ -80,9 +86,10 @@ bool xmrig::Job::setBlob(const char *blob) # ifdef XMRIG_PROXY_PROJECT memset(m_rawBlob, 0, sizeof(m_rawBlob)); - memcpy(m_rawBlob, blob, m_size * 2); + memcpy(m_rawBlob, blob, size * 2); # endif + m_size = size; return true; } diff --git a/src/base/net/stratum/Job.h b/src/base/net/stratum/Job.h index 3fb31baa..e314a266 100644 --- a/src/base/net/stratum/Job.h +++ b/src/base/net/stratum/Job.h @@ -59,6 +59,7 @@ public: ~Job() = default; bool isEqual(const Job &other) const; + bool isEqualBlob(const Job &other) const; bool setBlob(const char *blob); bool setSeedHash(const char *hash); bool setTarget(const char *target); diff --git a/src/core/Miner.cpp b/src/core/Miner.cpp index a892345d..9cc9092b 100644 --- a/src/core/Miner.cpp +++ b/src/core/Miner.cpp @@ -561,6 +561,12 @@ void xmrig::Miner::setJob(const Job &job, bool donate) const uint8_t index = donate ? 1 : 0; d_ptr->reset = !(d_ptr->job.index() == 1 && index == 0 && d_ptr->userJobId == job.id()); + + // Don't reset nonce if pool sends the same hashing blob again, but with different difficulty (for example) + if (d_ptr->job.isEqualBlob(job)) { + d_ptr->reset = false; + } + d_ptr->job = job; d_ptr->job.setIndex(index); diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 7b03f7bc..5d12bc55 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -413,8 +413,12 @@ xmrig::cn_hash_fun xmrig::CnHash::fn(const Algorithm &algorithm, AlgoVariant av, } # ifdef XMRIG_ALGO_CN_HEAVY - // cn-heavy optimization for Zen3 CPUs - if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) && (Cpu::info()->model() == 0x21)) { + // cn-heavy optimization for Zen3/Zen4 CPUs + const auto arch = Cpu::info()->arch(); + const uint32_t model = Cpu::info()->model(); + const bool is_vermeer = (arch == ICpuInfo::ARCH_ZEN3) && (model == 0x21); + const bool is_raphael = (arch == ICpuInfo::ARCH_ZEN4) && (model == 0x61); + if ((av == AV_SINGLE) && (assembly != Assembly::NONE) && (is_vermeer || is_raphael)) { switch (algorithm.id()) { case Algorithm::CN_HEAVY_0: return cryptonight_single_hash; diff --git a/src/crypto/common/VirtualMemory_unix.cpp b/src/crypto/common/VirtualMemory_unix.cpp index edda231f..12f4f25f 100644 --- a/src/crypto/common/VirtualMemory_unix.cpp +++ b/src/crypto/common/VirtualMemory_unix.cpp @@ -112,13 +112,19 @@ bool xmrig::VirtualMemory::protectRWX(void *p, size_t size) bool xmrig::VirtualMemory::protectRX(void *p, size_t size) { + bool result = true; + # if defined(XMRIG_OS_APPLE) && defined(XMRIG_ARM) pthread_jit_write_protect_np(true); - flushInstructionCache(p, size); - return true; # else - return mprotect(p, size, PROT_READ | PROT_EXEC) == 0; + result = (mprotect(p, size, PROT_READ | PROT_EXEC) == 0); # endif + +# if defined(XMRIG_ARM) + flushInstructionCache(p, size); +# endif + + return result; } diff --git a/src/crypto/randomx/blake2/avx2/LICENSE b/src/crypto/randomx/blake2/avx2/LICENSE new file mode 100644 index 00000000..0e259d42 --- /dev/null +++ b/src/crypto/randomx/blake2/avx2/LICENSE @@ -0,0 +1,121 @@ +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND 
DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. + +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. 
To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. 
+ Further, Affirmer disclaims responsibility for obtaining any necessary
+ consents, permissions or other rights required for any use of the
+ Work.
+ d. Affirmer understands and acknowledges that Creative Commons is not a
+ party to this document and has no duty or obligation with respect to
+ this CC0 or use of the Work.
diff --git a/src/crypto/randomx/blake2/avx2/blake2.h b/src/crypto/randomx/blake2/avx2/blake2.h
new file mode 100644
index 00000000..a1ea671f
--- /dev/null
+++ b/src/crypto/randomx/blake2/avx2/blake2.h
@@ -0,0 +1,38 @@
+#ifndef BLAKE2_AVX2_BLAKE2_H
+#define BLAKE2_AVX2_BLAKE2_H
+
+#if !defined(__cplusplus) && (!defined(__STDC_VERSION__) || __STDC_VERSION__ < 199901L)
+  #if defined(_MSC_VER)
+    #define INLINE __inline
+  #elif defined(__GNUC__)
+    #define INLINE __inline__
+  #else
+    #define INLINE
+  #endif
+#else
+  #define INLINE inline
+#endif
+
+#if defined(_MSC_VER)
+#define ALIGN(x) __declspec(align(x))
+#else
+#define ALIGN(x) __attribute__((aligned(x)))
+#endif
+
+enum blake2s_constant {
+  BLAKE2S_BLOCKBYTES = 64,
+  BLAKE2S_OUTBYTES = 32,
+  BLAKE2S_KEYBYTES = 32,
+  BLAKE2S_SALTBYTES = 8,
+  BLAKE2S_PERSONALBYTES = 8
+};
+
+enum blake2b_constant {
+  BLAKE2B_BLOCKBYTES = 128,
+  BLAKE2B_OUTBYTES = 64,
+  BLAKE2B_KEYBYTES = 64,
+  BLAKE2B_SALTBYTES = 16,
+  BLAKE2B_PERSONALBYTES = 16
+};
+
+#endif
diff --git a/src/crypto/randomx/blake2/avx2/blake2b-common.h b/src/crypto/randomx/blake2/avx2/blake2b-common.h
new file mode 100644
index 00000000..f9803ee2
--- /dev/null
+++ b/src/crypto/randomx/blake2/avx2/blake2b-common.h
@@ -0,0 +1,48 @@
+#ifndef BLAKE2_AVX2_BLAKE2B_COMMON_H
+#define BLAKE2_AVX2_BLAKE2B_COMMON_H
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include <immintrin.h>
+
+#include "blake2.h"
+
+#define LOAD128(p)    _mm_load_si128( (__m128i *)(p) )
+#define STORE128(p,r) _mm_store_si128((__m128i *)(p), r)
+
+#define LOADU128(p)    _mm_loadu_si128( (__m128i *)(p) )
+#define STOREU128(p,r) _mm_storeu_si128((__m128i *)(p), r)
+
+#define LOAD(p)    _mm256_load_si256( (__m256i *)(p) )
+#define STORE(p,r) _mm256_store_si256((__m256i *)(p), r)
+
+#define LOADU(p)    _mm256_loadu_si256( (__m256i *)(p) )
+#define STOREU(p,r) _mm256_storeu_si256((__m256i *)(p), r)
+
+static INLINE uint64_t LOADU64(void const * p) {
+  uint64_t v;
+  memcpy(&v, p, sizeof v);
+  return v;
+}
+
+#define ROTATE16 _mm256_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, \
+                                   2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 )
+
+#define ROTATE24 _mm256_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, \
+                                   3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 )
+
+#define ADD(a, b) _mm256_add_epi64(a, b)
+#define SUB(a, b) _mm256_sub_epi64(a, b)
+
+#define XOR(a, b) _mm256_xor_si256(a, b)
+#define AND(a, b) _mm256_and_si256(a, b)
+#define OR(a, b)  _mm256_or_si256(a, b)
+
+#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
+#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
+#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
+#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
+
+#endif
diff --git a/src/crypto/randomx/blake2/avx2/blake2b-load-avx2.h b/src/crypto/randomx/blake2/avx2/blake2b-load-avx2.h
new file mode 100644
index 00000000..28278f40
--- /dev/null
+++ b/src/crypto/randomx/blake2/avx2/blake2b-load-avx2.h
@@ -0,0 +1,340 @@
+#ifndef BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
+#define BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
+
+#define BLAKE2B_LOAD_MSG_0_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m1); \
+t1 = _mm256_unpacklo_epi64(m2, m3); \
+b0 = _mm256_blend_epi32(t0, t1, 
0xF0); \ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m0, m1);\ +t1 = _mm256_unpackhi_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m7, m4);\ +t1 = _mm256_unpacklo_epi64(m5, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_0_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_unpackhi_epi64(m5, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m7, m2);\ +t1 = _mm256_unpackhi_epi64(m4, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m4);\ +t1 = _mm256_alignr_epi8(m3, m7, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_3(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m2, m0);\ +t1 = _mm256_blend_epi32(m5, m0, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_1_4(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m6, m1, 8);\ +t1 = _mm256_blend_epi32(m3, m1, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_1(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m6, m5, 8);\ +t1 = _mm256_unpackhi_epi64(m2, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m0);\ +t1 = _mm256_blend_epi32(m6, m1, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_3(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m5, m4, 8);\ +t1 = _mm256_unpackhi_epi64(m1, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_2_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m2, m7);\ +t1 = _mm256_blend_epi32(m0, m3, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m3, m1);\ +t1 = _mm256_unpackhi_epi64(m6, m5);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m0);\ +t1 = _mm256_unpacklo_epi64(m6, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_3(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m1, m7, 8);\ +t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE(1,0,3,2));\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_3_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m3);\ +t1 = _mm256_unpacklo_epi64(m5, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m2);\ +t1 = _mm256_unpacklo_epi64(m1, m5);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_2(b0) \ +do { \ +t0 = _mm256_blend_epi32(m3, m0, 0x33);\ +t1 = _mm256_blend_epi32(m7, m2, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_3(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m7, m1, 8);\ +t1 = _mm256_alignr_epi8(m3, m5, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_4_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m6, m0);\ +t1 = _mm256_unpacklo_epi64(m6, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m1, m3);\ +t1 = _mm256_unpacklo_epi64(m0, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} 
while(0) + +#define BLAKE2B_LOAD_MSG_5_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m6, m5);\ +t1 = _mm256_unpackhi_epi64(m5, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_3(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m2, m0, 8);\ +t1 = _mm256_unpackhi_epi64(m3, m7);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_5_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m4, m6);\ +t1 = _mm256_alignr_epi8(m7, m2, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_1(b0) \ +do { \ +t0 = _mm256_blend_epi32(m0, m6, 0x33);\ +t1 = _mm256_unpacklo_epi64(m7, m2);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m2, m7);\ +t1 = _mm256_alignr_epi8(m5, m6, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m4, m0);\ +t1 = _mm256_blend_epi32(m4, m3, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_6_4(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m5, m3);\ +t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE(1,0,3,2));\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_1(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m6, m3);\ +t1 = _mm256_blend_epi32(m1, m6, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_2(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m7, m5, 8);\ +t1 = _mm256_unpackhi_epi64(m0, m4);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_3(b0) \ +do { \ +t0 = _mm256_blend_epi32(m2, m1, 0x33);\ +t1 = _mm256_alignr_epi8(m4, m7, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_7_4(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m0);\ +t1 = _mm256_unpacklo_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m3, m7);\ +t1 = _mm256_alignr_epi8(m0, m5, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_2(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_alignr_epi8(m4, m1, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_3(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m6);\ +t1 = _mm256_unpackhi_epi64(m6, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_8_4(b0) \ +do { \ +t0 = _mm256_alignr_epi8(m1, m2, 8);\ +t1 = _mm256_alignr_epi8(m2, m3, 8);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m5, m4);\ +t1 = _mm256_unpackhi_epi64(m3, m0);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_2(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m1, m2);\ +t1 = _mm256_blend_epi32(m2, m3, 0x33);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_3(b0) \ +do { \ +t0 = _mm256_unpackhi_epi64(m6, m7);\ +t1 = _mm256_unpackhi_epi64(m4, m1);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_9_4(b0) \ +do { \ +t0 = _mm256_blend_epi32(m5, m0, 0x33);\ +t1 = _mm256_unpacklo_epi64(m7, m6);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + +#define BLAKE2B_LOAD_MSG_10_1(b0) \ +do { \ +t0 = _mm256_unpacklo_epi64(m0, m1);\ +t1 = _mm256_unpacklo_epi64(m2, m3);\ +b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ +} while(0) + 
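+/* BLAKE2b has only ten distinct message permutations (sigma): rounds 10 and
+   11 of the 12-round compression reuse the schedules of rounds 0 and 1, so
+   the BLAKE2B_LOAD_MSG_10_* and BLAKE2B_LOAD_MSG_11_* loaders repeat the
+   bodies of the round 0 and round 1 macros. Each BLAKE2B_LOAD_MSG_r_i(b0)
+   builds the 4-word message vector consumed by one G step of round r using
+   unpack/align/blend shuffles instead of scalar gather loads. */
+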
+#define BLAKE2B_LOAD_MSG_10_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m0, m1);\
+t1 = _mm256_unpackhi_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m4);\
+t1 = _mm256_unpacklo_epi64(m5, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m7, m4);\
+t1 = _mm256_unpackhi_epi64(m5, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m2);\
+t1 = _mm256_unpackhi_epi64(m4, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_alignr_epi8(m3, m7, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_3(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m2, m0);\
+t1 = _mm256_blend_epi32(m5, m0, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_4(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m6, m1, 8);\
+t1 = _mm256_blend_epi32(m3, m1, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#endif
+
diff --git a/src/crypto/randomx/blake2/avx2/blake2b.h b/src/crypto/randomx/blake2/avx2/blake2b.h
new file mode 100644
index 00000000..a4598fd8
--- /dev/null
+++ b/src/crypto/randomx/blake2/avx2/blake2b.h
@@ -0,0 +1,16 @@
+#ifndef BLAKE2_AVX2_BLAKE2B_H
+#define BLAKE2_AVX2_BLAKE2B_H
+
+#include <stddef.h>
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+int blake2b_avx2(void* out, size_t outlen, const void* in, size_t inlen);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#endif
diff --git a/src/crypto/randomx/blake2/avx2/blake2b_avx2.c b/src/crypto/randomx/blake2/avx2/blake2b_avx2.c
new file mode 100644
index 00000000..6177ea19
--- /dev/null
+++ b/src/crypto/randomx/blake2/avx2/blake2b_avx2.c
@@ -0,0 +1,141 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <immintrin.h>
+
+#include "blake2.h"
+#include "blake2b.h"
+#include "blake2b-common.h"
+
+ALIGN(64) static const uint64_t blake2b_IV[8] = {
+  UINT64_C(0x6A09E667F3BCC908), UINT64_C(0xBB67AE8584CAA73B),
+  UINT64_C(0x3C6EF372FE94F82B), UINT64_C(0xA54FF53A5F1D36F1),
+  UINT64_C(0x510E527FADE682D1), UINT64_C(0x9B05688C2B3E6C1F),
+  UINT64_C(0x1F83D9ABFB41BD6B), UINT64_C(0x5BE0CD19137E2179),
+};
+
+#define BLAKE2B_G1_V1(a, b, c, d, m) do { \
+  a = ADD(a, m); \
+  a = ADD(a, b); d = XOR(d, a); d = ROT32(d); \
+  c = ADD(c, d); b = XOR(b, c); b = ROT24(b); \
+} while(0)
+
+#define BLAKE2B_G2_V1(a, b, c, d, m) do { \
+  a = ADD(a, m); \
+  a = ADD(a, b); d = XOR(d, a); d = ROT16(d); \
+  c = ADD(c, d); b = XOR(b, c); b = ROT63(b); \
+} while(0)
+
+#define BLAKE2B_DIAG_V1(a, b, c, d) do { \
+  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(2,1,0,3)); \
+  d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1,0,3,2)); \
+  c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(0,3,2,1)); \
+} while(0)
+
+#define BLAKE2B_UNDIAG_V1(a, b, c, d) do { \
+  a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(0,3,2,1)); \
+  d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1,0,3,2)); \
+  c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(2,1,0,3)); \
+} while(0)
+
+#include "blake2b-load-avx2.h"
+
+#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
+  __m256i b0; \
+  BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \
+  BLAKE2B_G1_V1(a, b, c, d, b0); \
+  BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \
+  BLAKE2B_G2_V1(a, b, c, d, b0); \
+  BLAKE2B_DIAG_V1(a, b, c, d); \
+  BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \
+  BLAKE2B_G1_V1(a, b, c, d, b0); \
+ 
BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \ + BLAKE2B_G2_V1(a, b, c, d, b0); \ + BLAKE2B_UNDIAG_V1(a, b, c, d); \ +} while(0) + +#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \ + BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \ + BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \ +} while(0) + +#define DECLARE_MESSAGE_WORDS(m) \ + const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0)); \ + const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \ + const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \ + const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \ + const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \ + const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \ + const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \ + const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \ + __m256i t0, t1; + +#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1) do { \ + DECLARE_MESSAGE_WORDS(m) \ + const __m256i iv0 = a; \ + const __m256i iv1 = b; \ + __m256i c = LOAD(&blake2b_IV[0]); \ + __m256i d = XOR( \ + LOAD(&blake2b_IV[4]), \ + _mm256_set_epi64x(f1, f0, t1, t0) \ + ); \ + BLAKE2B_ROUNDS_V1(a, b, c, d, m); \ + a = XOR(a, c); \ + b = XOR(b, d); \ + a = XOR(a, iv0); \ + b = XOR(b, iv1); \ +} while(0) + +int blake2b_avx2(void* out_ptr, size_t outlen, const void* in_ptr, size_t inlen) { + const __m256i parameter_block = _mm256_set_epi64x(0, 0, 0, 0x01010000UL | (uint32_t)outlen); + ALIGN(64) uint8_t buffer[BLAKE2B_BLOCKBYTES]; + __m256i a = XOR(LOAD(&blake2b_IV[0]), parameter_block); + __m256i b = LOAD(&blake2b_IV[4]); + uint64_t counter = 0; + const uint8_t* in = (const uint8_t*)in_ptr; + do { + const uint64_t flag = (inlen <= BLAKE2B_BLOCKBYTES) ? 
-1 : 0; + size_t block_size = BLAKE2B_BLOCKBYTES; + if(inlen < BLAKE2B_BLOCKBYTES) { + memcpy(buffer, in, inlen); + memset(buffer + inlen, 0, BLAKE2B_BLOCKBYTES - inlen); + block_size = inlen; + in = buffer; + } + counter += block_size; + BLAKE2B_COMPRESS_V1(a, b, in, counter, 0, flag, 0); + inlen -= block_size; + in += block_size; + } while(inlen > 0); + + uint8_t* out = (uint8_t*)out_ptr; + + switch (outlen) { + case 64: + STOREU(out + 32, b); + // Fall through + + case 32: + STOREU(out, a); + break; + + default: + STOREU(buffer, a); + STOREU(buffer + 32, b); + memcpy(out, buffer, outlen); + break; + } + + _mm256_zeroupper(); + return 0; +} diff --git a/src/crypto/randomx/blake2/blake2.h b/src/crypto/randomx/blake2/blake2.h index 52f05b39..dc81a593 100644 --- a/src/crypto/randomx/blake2/blake2.h +++ b/src/crypto/randomx/blake2/blake2.h @@ -92,7 +92,12 @@ extern "C" { int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen); /* Simple API */ - int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen); + void rx_blake2b_compress_integer(blake2b_state * S, const uint8_t * block); + void rx_blake2b_compress_sse41(blake2b_state * S, const uint8_t * block); + int rx_blake2b_default(void* out, size_t outlen, const void* in, size_t inlen); + + extern void (*rx_blake2b_compress)(blake2b_state * S, const uint8_t * block); + extern int (*rx_blake2b)(void* out, size_t outlen, const void* in, size_t inlen); /* Argon2 Team - Begin Code */ int rxa2_blake2b_long(void *out, size_t outlen, const void *in, size_t inlen); diff --git a/src/crypto/randomx/blake2/blake2b.c b/src/crypto/randomx/blake2/blake2b.c index 49329e46..6840c75f 100644 --- a/src/crypto/randomx/blake2/blake2b.c +++ b/src/crypto/randomx/blake2/blake2b.c @@ -179,7 +179,7 @@ int rx_blake2b_init_key(blake2b_state *S, size_t outlen, const void *key, size_t return 0; } -static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) { +void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) { uint64_t m[16]; uint64_t v[16]; unsigned int i, r; @@ -237,21 +237,6 @@ static void rx_blake2b_compress_integer(blake2b_state *S, const uint8_t *block) #undef ROUND } -#if defined(XMRIG_FEATURE_SSE4_1) - -uint32_t rx_blake2b_use_sse41 = 0; -void rx_blake2b_compress_sse41(blake2b_state* S, const uint8_t* block); - -#define rx_blake2b_compress(S, block) \ - if (rx_blake2b_use_sse41) \ - rx_blake2b_compress_sse41(S, block); \ - else \ - rx_blake2b_compress_integer(S, block); - -#else -#define rx_blake2b_compress(S, block) rx_blake2b_compress_integer(S, block); -#endif - int rx_blake2b_update(blake2b_state *S, const void *in, size_t inlen) { const uint8_t *pin = (const uint8_t *)in; @@ -322,7 +307,7 @@ int rx_blake2b_final(blake2b_state *S, void *out, size_t outlen) { return 0; } -int rx_blake2b(void *out, size_t outlen, const void *in, size_t inlen) { +int rx_blake2b_default(void *out, size_t outlen, const void *in, size_t inlen) { blake2b_state S; int ret = -1; diff --git a/src/crypto/randomx/bytecode_machine.hpp b/src/crypto/randomx/bytecode_machine.hpp index 8852f4d6..247e76e9 100644 --- a/src/crypto/randomx/bytecode_machine.hpp +++ b/src/crypto/randomx/bytecode_machine.hpp @@ -240,10 +240,17 @@ namespace randomx { return x; } + void cleanup() { + for (unsigned i = 0; i < RegistersCount; ++i) { + registerUsage[i] = -1; + } + nreg = nullptr; + } + private: static const int_reg_t zero; - int registerUsage[RegistersCount]; - NativeRegisterFile* nreg; + int registerUsage[RegistersCount] = {}; + 
NativeRegisterFile* nreg = nullptr; static void* getScratchpadAddress(InstructionByteCode& ibc, uint8_t* scratchpad) { uint32_t addr = (*ibc.isrc + ibc.imm) & ibc.memMask; diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index 3860f8fb..4a084fc0 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -167,6 +167,11 @@ namespace randomx { static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8, NOP9 }; + static const uint8_t NOP13[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x44, 0x00, 0x00 }; + static const uint8_t NOP14[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t NOP25[] = { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t NOP26[] = { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 }; + static const uint8_t JMP_ALIGN_PREFIX[14][16] = { {}, {0x2E}, @@ -257,6 +262,10 @@ namespace randomx { // AVX2 init is faster on Zen3 initDatasetAVX2 = true; break; + case xmrig::ICpuInfo::ARCH_ZEN4: + // AVX2 init is slower on Zen4 + initDatasetAVX2 = false; + break; } } } @@ -420,7 +429,7 @@ namespace randomx { *(uint32_t*)(code + codePos + 14) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated; if (hasAVX) { uint32_t* p = (uint32_t*)(code + codePos + 61); - *p = (*p & 0xFF000000U) | 0x0077F8C5U; + *p = (*p & 0xFF000000U) | 0x0077F8C5U; // vzeroupper } # ifdef XMRIG_FIX_RYZEN @@ -432,7 +441,8 @@ namespace randomx { memcpy(imul_rcp_storage - 34, &pcfg.eMask, sizeof(pcfg.eMask)); codePos = codePosFirst; - prevCFROUND = 0; + prevCFROUND = -1; + prevFPOperation = -1; //mark all registers as used uint64_t* r = (uint64_t*)registerUsage; @@ -1168,7 +1178,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint64_t dst = instr.dst % RegisterCountFlt; const uint64_t src = instr.src % RegisterCountFlt; @@ -1183,7 +1193,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint32_t src = instr.src % RegistersCount; const uint32_t dst = instr.dst % RegisterCountFlt; @@ -1200,7 +1210,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint64_t dst = instr.dst % RegisterCountFlt; const uint64_t src = instr.src % RegisterCountFlt; @@ -1215,7 +1225,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint32_t src = instr.src % RegistersCount; const uint32_t dst = instr.dst % RegisterCountFlt; @@ -1243,7 +1253,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint64_t dst = instr.dst % RegisterCountFlt; const uint64_t src = instr.src % RegisterCountFlt; @@ -1258,7 +1268,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint32_t src = instr.src % RegistersCount; const uint64_t dst = instr.dst % RegisterCountFlt; @@ -1285,7 +1295,7 @@ namespace randomx { uint8_t* const p = code; uint32_t pos = codePos; - prevCFROUND = 0; + prevFPOperation = pos; const uint32_t 
dst = instr.dst % RegisterCountFlt;
@@ -1296,21 +1306,18 @@ namespace randomx {
 	void JitCompilerX86::h_CFROUND(const Instruction& instr) {
 		uint8_t* const p = code;
-		uint32_t pos = prevCFROUND;
+		int32_t t = prevCFROUND;
 
-		if (pos) {
+		if (t > prevFPOperation) {
 			if (vm_flags & RANDOMX_FLAG_AMD) {
-				memcpy(p + pos + 0, NOP9, 9);
-				memcpy(p + pos + 9, NOP9, 9);
-				memcpy(p + pos + 18, NOP8, 8);
+				memcpy(p + t, NOP26, 26);
 			}
 			else {
-				memcpy(p + pos + 0, NOP8, 8);
-				memcpy(p + pos + 8, NOP6, 6);
+				memcpy(p + t, NOP14, 14);
 			}
 		}
 
-		pos = codePos;
+		uint32_t pos = codePos;
 		prevCFROUND = pos;
 
 		const uint32_t src = instr.src % RegistersCount;
@@ -1335,21 +1342,18 @@ namespace randomx {
 	void JitCompilerX86::h_CFROUND_BMI2(const Instruction& instr) {
 		uint8_t* const p = code;
-		uint32_t pos = prevCFROUND;
+		int32_t t = prevCFROUND;
 
-		if (pos) {
+		if (t > prevFPOperation) {
 			if (vm_flags & RANDOMX_FLAG_AMD) {
-				memcpy(p + pos + 0, NOP9, 9);
-				memcpy(p + pos + 9, NOP9, 9);
-				memcpy(p + pos + 18, NOP7, 7);
+				memcpy(p + t, NOP25, 25);
 			}
 			else {
-				memcpy(p + pos + 0, NOP8, 8);
-				memcpy(p + pos + 8, NOP5, 5);
+				memcpy(p + t, NOP13, 13);
 			}
 		}
 
-		pos = codePos;
+		uint32_t pos = codePos;
 		prevCFROUND = pos;
 
 		const uint64_t src = instr.src % RegistersCount;
@@ -1376,10 +1380,15 @@ namespace randomx {
 		uint8_t* const p = code;
 		uint32_t pos = codePos;
 
-		prevCFROUND = 0;
-
 		const int reg = instr.dst % RegistersCount;
-		int32_t jmp_offset = registerUsage[reg] - (pos + 16);
+		int32_t jmp_offset = registerUsage[reg];
+
+		// if it jumps over the previous FP instruction that uses rounding, treat it as if FP instruction happened now
+		if (jmp_offset <= prevFPOperation) {
+			prevFPOperation = pos;
+		}
+
+		jmp_offset -= pos + 16;
 
 		if (jccErratum) {
 			const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp
index abc8e74f..15261922 100644
--- a/src/crypto/randomx/jit_compiler_x86.hpp
+++ b/src/crypto/randomx/jit_compiler_x86.hpp
@@ -89,7 +89,8 @@ namespace randomx {
 		uint32_t codePos = 0;
 		uint32_t codePosFirst = 0;
 		uint32_t vm_flags = 0;
-		uint32_t prevCFROUND = 0;
+		int32_t prevCFROUND = -1;
+		int32_t prevFPOperation = -1;
 
 #		ifdef XMRIG_FIX_RYZEN
 		std::pair<const void*, const void*> mainLoopBounds;
diff --git a/src/crypto/randomx/vm_interpreted.cpp b/src/crypto/randomx/vm_interpreted.cpp
index 840ea768..9973a228 100644
--- a/src/crypto/randomx/vm_interpreted.cpp
+++ b/src/crypto/randomx/vm_interpreted.cpp
@@ -104,6 +104,8 @@ namespace randomx {
 		for (unsigned i = 0; i < RegisterCountFlt; ++i)
 			rx_store_vec_f128(&reg.e[i].lo, nreg.e[i]);
+
+		cleanup();
 	}
 
 	template
diff --git a/src/crypto/rx/Rx.cpp b/src/crypto/rx/Rx.cpp
index dae49655..1d9fa293 100644
--- a/src/crypto/rx/Rx.cpp
+++ b/src/crypto/rx/Rx.cpp
@@ -18,6 +18,7 @@
  */
 
 #include "crypto/rx/Rx.h"
+#include "backend/cpu/Cpu.h"
 #include "backend/cpu/CpuConfig.h"
 #include "backend/cpu/CpuThreads.h"
 #include "crypto/rx/RxConfig.h"
@@ -84,6 +85,16 @@ void xmrig::Rx::init(IRxListener *listener)
 }
 
 
+#include "crypto/randomx/blake2/blake2.h"
+#if defined(XMRIG_FEATURE_AVX2)
+#include "crypto/randomx/blake2/avx2/blake2b.h"
+#endif
+
+
+void (*rx_blake2b_compress)(blake2b_state* S, const uint8_t * block) = rx_blake2b_compress_integer;
+int (*rx_blake2b)(void* out, size_t outlen, const void* in, size_t inlen) = rx_blake2b_default;
+
+
 template<typename T>
 bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu)
 {
@@ -133,6 +144,19 @@ bool xmrig::Rx::init(const T &seed, const RxConfig &config, const CpuConfig &cpu
    if (!cpu.isHwAES()) {
         SelectSoftAESImpl(cpu.threads().get(seed.algorithm()).count());
     }
+
+# if defined(XMRIG_FEATURE_SSE4_1)
+    if (Cpu::info()->has(ICpuInfo::FLAG_SSE41)) {
+        rx_blake2b_compress = rx_blake2b_compress_sse41;
+    }
+# endif
+
+# if defined(XMRIG_FEATURE_AVX2)
+    if (Cpu::info()->has(ICpuInfo::FLAG_AVX2)) {
+        rx_blake2b = blake2b_avx2;
+    }
+# endif
+
     osInitialized = true;
 }
diff --git a/src/crypto/rx/RxConfig.cpp b/src/crypto/rx/RxConfig.cpp
index 1cd6f6eb..36287607 100644
--- a/src/crypto/rx/RxConfig.cpp
+++ b/src/crypto/rx/RxConfig.cpp
@@ -58,12 +58,13 @@ static const std::array<const char *, RxConfig::ModeMax> modeNames = { "auto", "
 
 #ifdef XMRIG_FEATURE_MSR
-constexpr size_t kMsrArraySize = 5;
+constexpr size_t kMsrArraySize = 6;
 
 static const std::array<MsrItems, kMsrArraySize> msrPresets = {
     MsrItems(),
     MsrItems{{ 0xC0011020, 0ULL }, { 0xC0011021, 0x40ULL, ~0x20ULL }, { 0xC0011022, 0x1510000ULL }, { 0xC001102b, 0x2000cc16ULL }},
     MsrItems{{ 0xC0011020, 0x0004480000000000ULL }, { 0xC0011021, 0x001c000200000040ULL, ~0x20ULL }, { 0xC0011022, 0xc000000401500000ULL }, { 0xC001102b, 0x2000cc14ULL }},
+    MsrItems{{ 0xC0011020, 0x0004400000000000ULL }, { 0xC0011021, 0x0004000000000040ULL, ~0x20ULL }, { 0xC0011022, 0x8680000401570000ULL }, { 0xC001102b, 0x2040cc10ULL }},
     MsrItems{{ 0x1a4, 0xf }},
     MsrItems()
 };
diff --git a/src/crypto/rx/RxVm.cpp b/src/crypto/rx/RxVm.cpp
index f4b37375..acaa25e0 100644
--- a/src/crypto/rx/RxVm.cpp
+++ b/src/crypto/rx/RxVm.cpp
@@ -25,11 +25,6 @@
 #include "crypto/rx/RxVm.h"
 
 
-#if defined(XMRIG_FEATURE_SSE4_1)
-extern "C" uint32_t rx_blake2b_use_sse41;
-#endif
-
-
 randomx_vm *xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool softAes, const Assembly &assembly, uint32_t node)
 {
     int flags = 0;
@@ -51,10 +46,6 @@ randomx_vm *xmrig::RxVm::create(RxDataset *dataset, uint8_t *scratchpad, bool so
         flags |= RANDOMX_FLAG_AMD;
     }
 
-# if defined(XMRIG_FEATURE_SSE4_1)
-    rx_blake2b_use_sse41 = Cpu::info()->has(ICpuInfo::FLAG_SSE41) ? 1 : 0;
-# endif
-
     return randomx_create_vm(static_cast<randomx_flags>(flags), !dataset->get() ? dataset->cache()->get() : nullptr, dataset->get(), scratchpad, node);
 }
diff --git a/src/version.h b/src/version.h
index 396c76c1..92f98fc8 100644
--- a/src/version.h
+++ b/src/version.h
@@ -22,7 +22,7 @@
 #define APP_ID        "xmrig"
 #define APP_NAME      "XMRig"
 #define APP_DESC      "XMRig miner"
-#define APP_VERSION   "6.18.0-mo1"
+#define APP_VERSION   "6.18.1-mo1"
 #define APP_DOMAIN    "xmrig.com"
 #define APP_SITE      "www.xmrig.com"
 #define APP_COPYRIGHT "Copyright (C) 2016-2022 xmrig.com"
@@ -30,7 +30,7 @@
 #define APP_VER_MAJOR  6
 #define APP_VER_MINOR  18
-#define APP_VER_PATCH  0
+#define APP_VER_PATCH  1
 
 #ifdef _MSC_VER
# if (_MSC_VER >= 1930)
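
The Rx::init hunk above replaces the old compile-time `rx_blake2b_use_sse41` switch with one-time runtime dispatch: `rx_blake2b` and `rx_blake2b_compress` are function pointers that start at portable defaults and are upgraded once, during init, when the CPU reports the matching feature flag. A minimal standalone sketch of that pattern (blake2b_portable, blake2b_wide and cpu_has_avx2 are illustrative stand-ins, not xmrig symbols):

#include <stddef.h>
#include <stdint.h>

typedef int (*blake2b_fn)(void *out, size_t outlen, const void *in, size_t inlen);

static int blake2b_portable(void *out, size_t outlen, const void *in, size_t inlen)
{
    (void)out; (void)outlen; (void)in; (void)inlen;
    return 0; /* reference implementation would go here */
}

static int blake2b_wide(void *out, size_t outlen, const void *in, size_t inlen)
{
    (void)out; (void)outlen; (void)in; (void)inlen;
    return 0; /* AVX2 implementation would go here */
}

static int cpu_has_avx2(void)
{
    return 0; /* stand-in for a CPUID leaf 7 (EBX bit 5) query */
}

static blake2b_fn g_blake2b = blake2b_portable; /* safe default */

static void select_blake2b(void)
{
    if (cpu_has_avx2()) {
        g_blake2b = blake2b_wide; /* upgraded once; every later call picks it up */
    }
}

int main(void)
{
    select_blake2b();
    uint8_t out[64];
    static const char in[] = "abc";
    return g_blake2b(out, sizeof out, in, sizeof in - 1);
}

Callers always go through the pointer, so the hot path pays a single indirect call instead of the per-block feature test that the removed rx_blake2b_compress macro performed.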
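For the Zen 4 additions, BasicCpuInfo keys everything off CPUID family/model: family 0x19 with model 0x61 (Raphael) selects ARCH_ZEN4 and the new MSR_MOD_RYZEN_19H_ZEN4 preset, while every other family-0x19 part stays on the Zen3 path. A compact sketch of that split (enum names shortened, not the actual xmrig types):

#include <stdint.h>
#include <stdio.h>

enum arch    { ZEN3, ZEN4 };
enum msr_mod { RYZEN_19H, RYZEN_19H_ZEN4 };

static enum arch arch_for_family_19h(uint32_t model)
{
    /* 0x61 == 97, the model value randomx_boost.sh greps for */
    return model == 0x61 ? ZEN4 : ZEN3;
}

static enum msr_mod msr_for_family_19h(uint32_t model)
{
    return model == 0x61 ? RYZEN_19H_ZEN4 : RYZEN_19H;
}

int main(void)
{
    printf("model 0x61 -> %s\n", arch_for_family_19h(0x61) == ZEN4 ? "Zen4" : "Zen3");
    printf("model 0x21 -> %s\n", arch_for_family_19h(0x21) == ZEN4 ? "Zen4" : "Zen3"); /* Vermeer stays Zen3 */
    return msr_for_family_19h(0x61) == RYZEN_19H_ZEN4 ? 0 : 1;
}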