diff --git a/src/backend/common/Workers.cpp b/src/backend/common/Workers.cpp index 9282a53c..a70affe6 100644 --- a/src/backend/common/Workers.cpp +++ b/src/backend/common/Workers.cpp @@ -214,13 +214,6 @@ void xmrig::Workers::start(const std::vector &data, bool sleep) for (auto worker : m_workers) { worker->start(Workers::onReady); - - // This sleep is important for optimal caching! - // Threads must allocate scratchpads in order so that adjacent cores will use adjacent scratchpads - // Sub-optimal caching can result in up to 0.5% hashrate penalty - if (sleep) { - std::this_thread::sleep_for(std::chrono::milliseconds(20)); - } } } diff --git a/src/backend/cpu/interfaces/ICpuInfo.h b/src/backend/cpu/interfaces/ICpuInfo.h index 7c08c103..44ec3301 100644 --- a/src/backend/cpu/interfaces/ICpuInfo.h +++ b/src/backend/cpu/interfaces/ICpuInfo.h @@ -53,6 +53,7 @@ public: enum Flag : uint32_t { FLAG_AES, + FLAG_AVX, FLAG_AVX2, FLAG_AVX512F, FLAG_BMI2, @@ -80,9 +81,11 @@ public: virtual Assembly::Id assembly() const = 0; virtual bool has(Flag feature) const = 0; virtual bool hasAES() const = 0; + virtual bool hasAVX() const = 0; virtual bool hasAVX2() const = 0; virtual bool hasBMI2() const = 0; virtual bool hasOneGbPages() const = 0; + virtual bool hasXOP() const = 0; virtual bool hasCatL3() const = 0; virtual bool isVM() const = 0; virtual const char *backend() const = 0; diff --git a/src/backend/cpu/platform/BasicCpuInfo.cpp b/src/backend/cpu/platform/BasicCpuInfo.cpp index 17716814..ae0f9aa3 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.cpp +++ b/src/backend/cpu/platform/BasicCpuInfo.cpp @@ -52,8 +52,8 @@ namespace xmrig { -constexpr size_t kCpuFlagsSize = 13; -static const std::array flagNames = { "aes", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" }; +constexpr size_t kCpuFlagsSize = 14; +static const std::array flagNames = { "aes", "avx", "avx2", "avx512f", "bmi2", "osxsave", "pdpe1gb", "sse2", "ssse3", "sse4.1", "xop", "popcnt", "cat_l3", "vm" }; static_assert(kCpuFlagsSize == ICpuInfo::FLAG_MAX, "kCpuFlagsSize and FLAG_MAX mismatch"); @@ -134,11 +134,12 @@ static inline uint64_t xgetbv() #endif } -static inline bool has_xcr_avx2() { return (xgetbv() & 0x06) == 0x06; } +static inline bool has_xcr_avx() { return (xgetbv() & 0x06) == 0x06; } static inline bool has_xcr_avx512() { return (xgetbv() & 0xE6) == 0xE6; } static inline bool has_osxsave() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 27); } static inline bool has_aes_ni() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 25); } -static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx2(); } +static inline bool has_avx() { return has_feature(PROCESSOR_INFO, ECX_Reg, 1 << 28) && has_osxsave() && has_xcr_avx(); } +static inline bool has_avx2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 5) && has_osxsave() && has_xcr_avx(); } static inline bool has_avx512f() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 16) && has_osxsave() && has_xcr_avx512(); } static inline bool has_bmi2() { return has_feature(EXTENDED_FEATURES, EBX_Reg, 1 << 8); } static inline bool has_pdpe1gb() { return has_feature(PROCESSOR_EXT_INFO, EDX_Reg, 1 << 26); } @@ -175,6 +176,7 @@ xmrig::BasicCpuInfo::BasicCpuInfo() : cpu_brand_string(m_brand); m_flags.set(FLAG_AES, has_aes_ni()); + m_flags.set(FLAG_AVX, has_avx()); m_flags.set(FLAG_AVX2, has_avx2()); m_flags.set(FLAG_AVX512F, has_avx512f()); m_flags.set(FLAG_BMI2, has_bmi2()); diff --git a/src/backend/cpu/platform/BasicCpuInfo.h b/src/backend/cpu/platform/BasicCpuInfo.h index 5504d07b..d21b6d61 100644 --- a/src/backend/cpu/platform/BasicCpuInfo.h +++ b/src/backend/cpu/platform/BasicCpuInfo.h @@ -48,9 +48,11 @@ protected: inline Assembly::Id assembly() const override { return m_assembly; } inline bool has(Flag flag) const override { return m_flags.test(flag); } inline bool hasAES() const override { return has(FLAG_AES); } + inline bool hasAVX() const override { return has(FLAG_AVX); } inline bool hasAVX2() const override { return has(FLAG_AVX2); } inline bool hasBMI2() const override { return has(FLAG_BMI2); } inline bool hasOneGbPages() const override { return has(FLAG_PDPE1GB); } + inline bool hasXOP() const override { return has(FLAG_XOP); } inline bool hasCatL3() const override { return has(FLAG_CAT_L3); } inline bool isVM() const override { return has(FLAG_VM); } inline const char *brand() const override { return m_brand; } diff --git a/src/crypto/randomx/asm/program_sshash_avx2_constants.inc b/src/crypto/randomx/asm/program_sshash_avx2_constants.inc new file mode 100644 index 00000000..e2e5e0b1 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_constants.inc @@ -0,0 +1,28 @@ +r0_avx2_increments: + db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0 +mul_hi_avx2_data: + db 0,0,0,0,1,0,0,0 +r0_avx2_mul: + ;#/ 6364136223846793005 + db 45, 127, 149, 76, 45, 244, 81, 88 +r1_avx2_add: + ;#/ 9298411001130361340 + db 252, 161, 245, 89, 138, 151, 10, 129 +r2_avx2_add: + ;#/ 12065312585734608966 + db 70, 216, 194, 56, 223, 153, 112, 167 +r3_avx2_add: + ;#/ 9306329213124626780 + db 92, 73, 34, 191, 28, 185, 38, 129 +r4_avx2_add: + ;#/ 5281919268842080866 + db 98, 138, 159, 23, 151, 37, 77, 73 +r5_avx2_add: + ;#/ 10536153434571861004 + db 12, 236, 170, 206, 185, 239, 55, 146 +r6_avx2_add: + ;#/ 3398623926847679864 + db 120, 45, 230, 108, 116, 86, 42, 47 +r7_avx2_add: + ;#/ 9549104520008361294 + db 78, 229, 44, 182, 247, 59, 133, 132 \ No newline at end of file diff --git a/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc b/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc new file mode 100644 index 00000000..88204d99 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc @@ -0,0 +1,31 @@ + add rsp, 32 + pop r9 + + movdqu xmm0, xmmword ptr [rsp] + movdqu xmm1, xmmword ptr [rsp + 16] + movdqu xmm2, xmmword ptr [rsp + 32] + movdqu xmm3, xmmword ptr [rsp + 48] + movdqu xmm4, xmmword ptr [rsp + 64] + movdqu xmm5, xmmword ptr [rsp + 80] + movdqu xmm6, xmmword ptr [rsp + 96] + movdqu xmm7, xmmword ptr [rsp + 112] + movdqu xmm8, xmmword ptr [rsp + 128] + movdqu xmm9, xmmword ptr [rsp + 144] + movdqu xmm10, xmmword ptr [rsp + 160] + movdqu xmm11, xmmword ptr [rsp + 176] + movdqu xmm12, xmmword ptr [rsp + 192] + movdqu xmm13, xmmword ptr [rsp + 208] + movdqu xmm14, xmmword ptr [rsp + 224] + movdqu xmm15, xmmword ptr [rsp + 240] + vzeroupper + add rsp, 256 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rsi + pop rdi + pop rbp + pop rbx + ret diff --git a/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc b/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc new file mode 100644 index 00000000..8055cf28 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc @@ -0,0 +1,37 @@ + ;# prefetch RandomX dataset lines + prefetchnta byte ptr [rsi] + prefetchnta byte ptr [rsi+64] + prefetchnta byte ptr [rsi+128] + prefetchnta byte ptr [rsi+192] + prefetchnta byte ptr [rsi+256] + + ;# prefetch RandomX cache lines + mov rbx, rbp + and rbx, RANDOMX_CACHE_MASK + shl rbx, 6 + add rbx, rdi + prefetchnta byte ptr [rbx] + lea rax, [rbp+1] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp], rax + lea rax, [rbp+2] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+8], rax + lea rax, [rbp+3] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+16], rax + lea rax, [rbp+4] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + prefetchnta byte ptr [rax] + mov [rsp+24], rax diff --git a/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc b/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc new file mode 100644 index 00000000..46dd469d --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc @@ -0,0 +1,38 @@ + mov qword ptr [rsi+0], r8 + vpunpcklqdq ymm8, ymm0, ymm1 + mov qword ptr [rsi+8], r9 + vpunpcklqdq ymm9, ymm2, ymm3 + mov qword ptr [rsi+16], r10 + vpunpcklqdq ymm10, ymm4, ymm5 + mov qword ptr [rsi+24], r11 + vpunpcklqdq ymm11, ymm6, ymm7 + mov qword ptr [rsi+32], r12 + vpunpckhqdq ymm12, ymm0, ymm1 + mov qword ptr [rsi+40], r13 + vpunpckhqdq ymm13, ymm2, ymm3 + mov qword ptr [rsi+48], r14 + vpunpckhqdq ymm14, ymm4, ymm5 + mov qword ptr [rsi+56], r15 + vpunpckhqdq ymm15, ymm6, ymm7 + + vperm2i128 ymm0, ymm8, ymm9, 32 + vperm2i128 ymm1, ymm10, ymm11, 32 + vmovdqu ymmword ptr [rsi+64], ymm0 + vmovdqu ymmword ptr [rsi+96], ymm1 + vperm2i128 ymm2, ymm12, ymm13, 32 + vperm2i128 ymm3, ymm14, ymm15, 32 + vmovdqu ymmword ptr [rsi+128], ymm2 + vmovdqu ymmword ptr [rsi+160], ymm3 + vperm2i128 ymm4, ymm8, ymm9, 49 + vperm2i128 ymm5, ymm10, ymm11, 49 + vmovdqu ymmword ptr [rsi+192], ymm4 + vmovdqu ymmword ptr [rsi+224], ymm5 + vperm2i128 ymm6, ymm12, ymm13, 49 + vperm2i128 ymm7, ymm14, ymm15, 49 + vmovdqu ymmword ptr [rsi+256], ymm6 + vmovdqu ymmword ptr [rsi+288], ymm7 + + add rbp, 5 + add rsi, 320 + cmp rbp, qword ptr [rsp+32] + db 15, 130, 0, 0, 0, 0 ;# jb rel32 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc b/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc new file mode 100644 index 00000000..a551ffa4 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc @@ -0,0 +1,27 @@ + push rbx + push rbp + push rdi + push rsi + push r12 + push r13 + push r14 + push r15 + + ;# save all XMM registers just to be safe for all calling conventions + sub rsp, 256 + movdqu xmmword ptr [rsp], xmm0 + movdqu xmmword ptr [rsp + 16], xmm1 + movdqu xmmword ptr [rsp + 32], xmm2 + movdqu xmmword ptr [rsp + 48], xmm3 + movdqu xmmword ptr [rsp + 64], xmm4 + movdqu xmmword ptr [rsp + 80], xmm5 + movdqu xmmword ptr [rsp + 96], xmm6 + movdqu xmmword ptr [rsp + 112], xmm7 + movdqu xmmword ptr [rsp + 128], xmm8 + movdqu xmmword ptr [rsp + 144], xmm9 + movdqu xmmword ptr [rsp + 160], xmm10 + movdqu xmmword ptr [rsp + 176], xmm11 + movdqu xmmword ptr [rsp + 192], xmm12 + movdqu xmmword ptr [rsp + 208], xmm13 + movdqu xmmword ptr [rsp + 224], xmm14 + movdqu xmmword ptr [rsp + 240], xmm15 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc b/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc new file mode 100644 index 00000000..bed78094 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc @@ -0,0 +1,50 @@ + sub rsp, 40 + mov [rsp], rbx + vmovdqu ymmword ptr [rsp+8], ymm14 + + mov rax, [rsp+40] + mov rbx, [rsp+48] + mov rcx, [rsp+56] + mov rdx, [rsp+64] + + vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1] + vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2] + vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3] + vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4] + + vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2] + vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4] + vpxor ymm0, ymm0, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4] + vpxor ymm2, ymm2, ymm14 + + vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2] + vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4] + vpxor ymm1, ymm1, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4] + vpxor ymm3, ymm3, ymm14 + + vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1] + vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2] + vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3] + vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4] + + vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2] + vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4] + vpxor ymm4, ymm4, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4] + vpxor ymm6, ymm6, ymm14 + + vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2] + vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4] + vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4] + vpxor ymm5, ymm5, ymm14 + vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4] + vpxor ymm7, ymm7, ymm14 + + mov rbx, [rsp] + vmovdqu ymm14, ymmword ptr [rsp+8] + add rsp, 40 diff --git a/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc b/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc new file mode 100644 index 00000000..072de864 --- /dev/null +++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc @@ -0,0 +1,29 @@ + vmovdqu ymmword ptr [rsp], ymm0 + + mov rax, [rsp] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+8] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+8], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+16] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+16], rax + prefetchnta byte ptr [rax] + + mov rax, [rsp+24] + and rax, RANDOMX_CACHE_MASK + shl rax, 6 + add rax, rdi + mov [rsp+24], rax + prefetchnta byte ptr [rax] diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp index ee3e1b45..d3a71f94 100644 --- a/src/crypto/randomx/jit_compiler_x86.cpp +++ b/src/crypto/randomx/jit_compiler_x86.cpp @@ -49,8 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef _MSC_VER # include -#else -# include #endif static bool hugePagesJIT = false; @@ -116,6 +114,11 @@ namespace randomx { #define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init) #define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin) #define codeDatasetInit ADDR(randomx_dataset_init) + #define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue) + #define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end) + #define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue) + #define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load) + #define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch) #define codeLoopStore ADDR(randomx_program_loop_store) #define codeLoopEnd ADDR(randomx_program_loop_end) #define codeEpilogue ADDR(randomx_program_epilogue) @@ -132,7 +135,12 @@ namespace randomx { #define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit) #define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin) #define loopStoreSize (codeLoopEnd - codeLoopStore) - #define datasetInitSize (codeEpilogue - codeDatasetInit) + #define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit) + #define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue) + #define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end) + #define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue) + #define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load) + #define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch) #define epilogueSize (codeShhLoad - codeEpilogue) #define codeSshLoadSize (codeShhPrefetch - codeShhLoad) #define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch) @@ -192,17 +200,6 @@ namespace randomx { xmrig::VirtualMemory::protectRX(p1, p2 - p1); } - static inline void cpuid(uint32_t level, int32_t output[4]) - { - memset(output, 0, sizeof(int32_t) * 4); - -# ifdef _MSC_VER - __cpuid(output, static_cast(level)); -# else - __cpuid_count(level, 0, output[0], output[1], output[2], output[3]); -# endif - } - # ifdef _MSC_VER static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); } # else @@ -215,14 +212,11 @@ namespace randomx { JitCompilerX86::JitCompilerX86(bool hugePagesEnable) { BranchesWithin32B = xmrig::Cpu::info()->jccErratum(); - int32_t info[4]; - cpuid(1, info); - hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0); + hasAVX = xmrig::Cpu::info()->hasAVX(); + hasAVX2 = xmrig::Cpu::info()->hasAVX2(); + hasXOP = xmrig::Cpu::info()->hasXOP(); - cpuid(0x80000001, info); - hasXOP = ((info[2] & (1 << 11)) != 0); - - allocatedSize = CodeSize * 2; + allocatedSize = hasAVX2 ? (CodeSize * 4) : (CodeSize * 2); allocatedCode = static_cast(allocExecutableMemory(allocatedSize, # ifdef XMRIG_SECURE_JIT false @@ -304,14 +298,49 @@ namespace randomx { template void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) { + uint8_t* p = code; + if (hasAVX2) { + codePos = 0; + emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos); + + for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) { + SuperscalarProgram& prog = programs[j]; + uint32_t pos = codePos; + for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { + generateSuperscalarCode(prog(i), p, pos); + } + codePos = pos; + emit(codeShhLoad, codeSshLoadSize, code, codePos); + emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos); + if (j < RandomX_CurrentConfig.CacheAccesses - 1) { + *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast(prog.getAddressRegister()) << 16); + codePos += 3; + emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos); + uint8_t* p = code + codePos; + emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos); + p[3] += prog.getAddressRegister() << 3; + } + } + + emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos); + + // Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label + constexpr int32_t prologue_size = 320; + *(int32_t*)(code + codePos - 4) = prologue_size - codePos; + + emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos); + return; + } + memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize); codePos = superScalarHashOffset + codeSshInitSize; for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) { SuperscalarProgram& prog = programs[j]; - for (unsigned i = 0; i < prog.getSize(); ++i) { - Instruction& instr = prog(i); - generateSuperscalarCode(instr); + uint32_t pos = codePos; + for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) { + generateSuperscalarCode(prog(i), p, pos); } + codePos = pos; emit(codeShhLoad, codeSshLoadSize, code, codePos); if (j < RandomX_CurrentConfig.CacheAccesses - 1) { *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast(prog.getAddressRegister()) << 16); @@ -326,7 +355,10 @@ namespace randomx { void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]); void JitCompilerX86::generateDatasetInitCode() { - memcpy(code, codeDatasetInit, datasetInitSize); + // AVX2 code is generated in generateSuperscalarHash() + if (!hasAVX2) { + memcpy(code, codeDatasetInit, datasetInitSize); + } } void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) { @@ -405,85 +437,243 @@ namespace randomx { emit32(epilogueOffset - codePos - 4, code, codePos); } - void JitCompilerX86::generateSuperscalarCode(Instruction& instr) { - static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b }; - static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b }; - static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b }; - static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf }; - static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf }; - static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 }; - static constexpr uint8_t REX_81[] = { 0x49, 0x81 }; - static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 }; - static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d }; - static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 }; - static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 }; - static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 }; - + template + FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) { switch ((SuperscalarInstructionType)instr.opcode) { case randomx::SuperscalarInstructionType::ISUB_R: - emit(REX_SUB_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16); + codePos += 3; + if (AVX2) { + emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } break; case randomx::SuperscalarInstructionType::IXOR_R: - emit(REX_XOR_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16); + codePos += 3; + if (AVX2) { + emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } break; case randomx::SuperscalarInstructionType::IADD_RS: - emit(REX_LEA, code, codePos); - emitByte(0x04 + 8 * instr.dst, code, codePos); - genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos); + emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos); + if (AVX2) { + if (instr.getModShift()) { + static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.src; + p[4] = instr.getModShift(); + p[8] += instr.dst * 9; + } + else { + emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); + } + } break; case randomx::SuperscalarInstructionType::IMUL_R: - emit(REX_IMUL_RR, code, codePos); - emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos); + emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos); + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, + 0xC5, 0x35, 0xF4, 0xD8, + 0xC5, 0xBD, 0xF4, 0xC0, + 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20, + 0xC5, 0xFD, 0x73, 0xF0, 0x20, + 0xC4, 0x41, 0x2D, 0xD4, 0xD3, + 0xC5, 0xAD, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.dst; + p[21] += instr.dst * 8 + instr.src; + p[29] -= instr.dst * 8; + p[31] += instr.dst; + p[41] += instr.dst * 9; + } break; case randomx::SuperscalarInstructionType::IROR_C: - emit(REX_ROT_I8, code, codePos); - emitByte(0xc8 + instr.dst, code, codePos); - emitByte(instr.getImm32() & 63, code, codePos); + { + const uint32_t shift = instr.getImm32() & 63; + emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[4] = shift; + p[8] += instr.dst; + p[9] = 64 - shift; + p[14] += instr.dst * 8; + } + } break; case randomx::SuperscalarInstructionType::IADD_C7: case randomx::SuperscalarInstructionType::IADD_C8: case randomx::SuperscalarInstructionType::IADD_C9: - emit(REX_81, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit32(instr.getImm32(), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); + p[12] += instr.dst * 8; + p[24] -= instr.dst * 8; + p[26] += instr.dst * 8; + } + else { + *(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16); + codePos += 3; + emit32(instr.getImm32(), code, codePos); + } break; case randomx::SuperscalarInstructionType::IXOR_C7: case randomx::SuperscalarInstructionType::IXOR_C8: case randomx::SuperscalarInstructionType::IXOR_C9: - emit(REX_XOR_RI, code, codePos); - emitByte(0xf0 + instr.dst, code, codePos); - emit32(instr.getImm32(), code, codePos); + if (AVX2) { + static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); + p[12] += instr.dst * 8; + p[24] -= instr.dst * 8; + p[26] += instr.dst * 8; + } + else { + *(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16); + codePos += 3; + emit32(instr.getImm32(), code, codePos); + } break; case randomx::SuperscalarInstructionType::IMULH_R: - emit(REX_MOV_RR64, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit(REX_MUL_R, code, codePos); - emitByte(0xe0 + instr.src, code, codePos); - emit(REX_MOV_R64R, code, codePos); - emitByte(0xc2 + 8 * instr.dst, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); + codePos += 3; + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, + 0xC5, 0x3D, 0xF4, 0xD8, + 0xC4, 0x41, 0x7D, 0xF4, 0xE1, + 0xC4, 0xC1, 0x3D, 0xF4, 0xC1, + 0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, + 0xC4, 0x41, 0x25, 0xEF, 0xC6, + 0xC4, 0x41, 0x25, 0xD4, 0xDC, + 0xC4, 0x41, 0x25, 0xD4, 0xDA, + 0xC4, 0x41, 0x25, 0xEF, 0xCE, + 0xC4, 0x42, 0x3D, 0x37, 0xC1, + 0xC4, 0x41, 0x3D, 0xDB, 0xC7, + 0xC5, 0xBD, 0xD4, 0xC0, + 0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20, + 0xC5, 0xA5, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.src; + p[20] -= instr.dst * 8; + p[27] += instr.dst * 8; + p[67] += instr.dst * 9; + p[77] += instr.dst * 9; + } break; case randomx::SuperscalarInstructionType::ISMULH_R: - emit(REX_MOV_RR64, code, codePos); - emitByte(0xc0 + instr.dst, code, codePos); - emit(REX_MUL_R, code, codePos); - emitByte(0xe8 + instr.src, code, codePos); - emit(REX_MOV_R64R, code, codePos); - emitByte(0xc2 + 8 * instr.dst, code, codePos); + *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16); + codePos += 3; + *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); + codePos += 3; + if (AVX2) { + static const uint8_t t[] = { + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC5, 0xB5, 0x73, 0xD0, 0x20, + 0xC5, 0x7D, 0xF4, 0xD0, + 0xC5, 0x3D, 0xF4, 0xD8, + 0xC4, 0x41, 0x7D, 0xF4, 0xE1, + 0xC4, 0x41, 0x3D, 0xF4, 0xE9, + 0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, + 0xC4, 0x41, 0x25, 0xEF, 0xC6, + 0xC4, 0x41, 0x25, 0xD4, 0xDC, + 0xC4, 0x41, 0x25, 0xD4, 0xDA, + 0xC4, 0x41, 0x25, 0xEF, 0xCE, + 0xC4, 0x42, 0x3D, 0x37, 0xC1, + 0xC4, 0x41, 0x3D, 0xDB, 0xC7, + 0xC4, 0x41, 0x15, 0xD4, 0xE8, + 0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20, + 0xC4, 0x41, 0x15, 0xD4, 0xC3, + 0xC4, 0x41, 0x35, 0xEF, 0xC9, + 0xC4, 0x62, 0x35, 0x37, 0xD0, + 0xC4, 0x62, 0x35, 0x37, 0xD8, + 0xC5, 0x2D, 0xDB, 0xD0, + 0xC5, 0x25, 0xDB, 0xD8, + 0xC4, 0x41, 0x3D, 0xFB, 0xC2, + 0xC4, 0xC1, 0x3D, 0xFB, 0xC3 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[3] += instr.dst; + p[8] += instr.src; + p[11] -= instr.dst * 8; + p[13] += instr.src; + p[17] += instr.src; + p[20] -= instr.dst * 8; + p[89] += instr.dst; + p[94] += instr.src; + p[98] += instr.src; + p[102] += instr.dst; + p[112] += instr.dst * 8; + } break; case randomx::SuperscalarInstructionType::IMUL_RCP: - emit(MOV_RAX_I, code, codePos); + *(uint32_t*)(code + codePos) = 0x0000B848UL; + codePos += 2; emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos); - emit(REX_IMUL_RM, code, codePos); - emitByte(0xc0 + 8 * instr.dst, code, codePos); + emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos); + if (AVX2) { + static const uint8_t t[] = { + 0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF, + 0xC5, 0xBD, 0x73, 0xD0, 0x20, + 0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20, + 0xC4, 0x41, 0x7D, 0xF4, 0xD4, + 0xC5, 0x35, 0xF4, 0xD8, + 0xC4, 0xC1, 0x3D, 0xF4, 0xC4, + 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20, + 0xC5, 0xFD, 0x73, 0xF0, 0x20, + 0xC4, 0x41, 0x2D, 0xD4, 0xD3, + 0xC5, 0xAD, 0xD4, 0xC0 + }; + uint8_t* p = code + codePos; + emit(t, code, codePos); + p[12] += instr.dst; + p[22] -= instr.dst * 8; + p[28] += instr.dst; + p[33] += instr.dst * 8; + p[41] -= instr.dst * 8; + p[43] += instr.dst; + p[53] += instr.dst * 9; + } break; default: UNREACHABLE; } } + template void JitCompilerX86::generateSuperscalarCode(Instruction&, uint8_t*, uint32_t&); + template void JitCompilerX86::generateSuperscalarCode(Instruction&, uint8_t*, uint32_t&); + template FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) { *(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16); @@ -563,10 +753,6 @@ namespace randomx { codePos = pos; } - void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) { - emitByte((scale << 6) | (index << 3) | base, code, codePos); - } - void JitCompilerX86::h_ISUB_R(const Instruction& instr) { uint8_t* const p = code; uint32_t pos = codePos; diff --git a/src/crypto/randomx/jit_compiler_x86.hpp b/src/crypto/randomx/jit_compiler_x86.hpp index 32ca97b5..8aa3484d 100644 --- a/src/crypto/randomx/jit_compiler_x86.hpp +++ b/src/crypto/randomx/jit_compiler_x86.hpp @@ -96,6 +96,7 @@ namespace randomx { bool BranchesWithin32B = false; bool hasAVX; + bool hasAVX2; bool hasXOP; uint8_t* allocatedCode = nullptr; @@ -107,9 +108,10 @@ namespace randomx { static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos); static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos); static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos); - static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos); + static uint32_t genSIB(int scale, int index, int base) { return (scale << 6) | (index << 3) | base; } - void generateSuperscalarCode(Instruction &); + template + void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos); static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) { code[codePos] = val; diff --git a/src/crypto/randomx/jit_compiler_x86_static.S b/src/crypto/randomx/jit_compiler_x86_static.S index 9f3a5bf1..da5ee98e 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.S +++ b/src/crypto/randomx/jit_compiler_x86_static.S @@ -52,6 +52,11 @@ .global DECL(randomx_program_loop_store) .global DECL(randomx_program_loop_end) .global DECL(randomx_dataset_init) +.global DECL(randomx_dataset_init_avx2_prologue) +.global DECL(randomx_dataset_init_avx2_loop_end) +.global DECL(randomx_dataset_init_avx2_epilogue) +.global DECL(randomx_dataset_init_avx2_ssh_load) +.global DECL(randomx_dataset_init_avx2_ssh_prefetch) .global DECL(randomx_program_epilogue) .global DECL(randomx_sshash_load) .global DECL(randomx_sshash_prefetch) @@ -192,6 +197,98 @@ call_offset: pop rbx ret +.balign 64 +DECL(randomx_dataset_init_avx2_prologue): + #include "asm/program_sshash_avx2_save_registers.inc" + +#if defined(WINABI) + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index +#else + mov rdi, qword ptr [rdi] ;# cache->memory + ;# dataset in rsi + mov rbp, rdx ;# block index + push rcx ;# max. block index +#endif + sub rsp, 32 + + jmp randomx_dataset_init_avx2_prologue_loop_begin + #include "asm/program_sshash_avx2_constants.inc" + +.balign 64 +randomx_dataset_init_avx2_prologue_loop_begin: + #include "asm/program_sshash_avx2_loop_begin.inc" + + ;# init integer registers (lane 0) + lea r8, [rbp+1] + imul r8, qword ptr [r0_avx2_mul+rip] + mov r9, qword ptr [r1_avx2_add+rip] + xor r9, r8 + mov r10, qword ptr [r2_avx2_add+rip] + xor r10, r8 + mov r11, qword ptr [r3_avx2_add+rip] + xor r11, r8 + mov r12, qword ptr [r4_avx2_add+rip] + xor r12, r8 + mov r13, qword ptr [r5_avx2_add+rip] + xor r13, r8 + mov r14, qword ptr [r6_avx2_add+rip] + xor r14, r8 + mov r15, qword ptr [r7_avx2_add+rip] + xor r15, r8 + + ;# init AVX registers (lanes 1-4) + vpxor ymm0, ymm0, ymm0 + movq xmm0, rbp + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip] + + ;# ymm0 *= r0_avx2_mul + vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip] + vpsrlq ymm8, ymm0, 32 + vpsrlq ymm9, ymm1, 32 + vpmuludq ymm10, ymm0, ymm1 + vpmuludq ymm11, ymm9, ymm0 + vpmuludq ymm0, ymm8, ymm1 + vpsllq ymm11, ymm11, 32 + vpsllq ymm0, ymm0, 32 + vpaddq ymm10, ymm10, ymm11 + vpaddq ymm0, ymm10, ymm0 + + vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip] + vpxor ymm1, ymm0, ymm1 + vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip] + vpxor ymm2, ymm0, ymm2 + vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip] + vpxor ymm3, ymm0, ymm3 + vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip] + vpxor ymm4, ymm0, ymm4 + vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip] + vpxor ymm5, ymm0, ymm5 + vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip] + vpxor ymm6, ymm0, ymm6 + vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip] + vpxor ymm7, ymm0, ymm7 + + vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32) + vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63) + + ;# generated SuperscalarHash code goes here + +DECL(randomx_dataset_init_avx2_loop_end): + #include "asm/program_sshash_avx2_loop_end.inc" + +DECL(randomx_dataset_init_avx2_epilogue): + #include "asm/program_sshash_avx2_epilogue.inc" + +DECL(randomx_dataset_init_avx2_ssh_load): + #include "asm/program_sshash_avx2_ssh_load.inc" + +DECL(randomx_dataset_init_avx2_ssh_prefetch): + #include "asm/program_sshash_avx2_ssh_prefetch.inc" + .balign 64 DECL(randomx_program_epilogue): #include "asm/program_epilogue_store.inc" diff --git a/src/crypto/randomx/jit_compiler_x86_static.asm b/src/crypto/randomx/jit_compiler_x86_static.asm index e36e5aaf..f8a2d527 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.asm +++ b/src/crypto/randomx/jit_compiler_x86_static.asm @@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen PUBLIC randomx_program_read_dataset_sshash_init PUBLIC randomx_program_read_dataset_sshash_fin PUBLIC randomx_dataset_init +PUBLIC randomx_dataset_init_avx2_prologue +PUBLIC randomx_dataset_init_avx2_loop_end +PUBLIC randomx_dataset_init_avx2_epilogue +PUBLIC randomx_dataset_init_avx2_ssh_load +PUBLIC randomx_dataset_init_avx2_ssh_prefetch PUBLIC randomx_program_loop_store PUBLIC randomx_program_loop_end PUBLIC randomx_program_epilogue @@ -183,6 +188,95 @@ init_block_loop: randomx_dataset_init ENDP ALIGN 64 +randomx_dataset_init_avx2_prologue PROC + include asm/program_sshash_avx2_save_registers.inc + + mov rdi, qword ptr [rcx] ;# cache->memory + mov rsi, rdx ;# dataset + mov rbp, r8 ;# block index + push r9 ;# max. block index + sub rsp, 32 + + jmp loop_begin + include asm/program_sshash_avx2_constants.inc + +ALIGN 64 +loop_begin: + include asm/program_sshash_avx2_loop_begin.inc + + ;# init integer registers (lane 0) + lea r8, [rbp+1] + imul r8, qword ptr [r0_avx2_mul] + mov r9, qword ptr [r1_avx2_add] + xor r9, r8 + mov r10, qword ptr [r2_avx2_add] + xor r10, r8 + mov r11, qword ptr [r3_avx2_add] + xor r11, r8 + mov r12, qword ptr [r4_avx2_add] + xor r12, r8 + mov r13, qword ptr [r5_avx2_add] + xor r13, r8 + mov r14, qword ptr [r6_avx2_add] + xor r14, r8 + mov r15, qword ptr [r7_avx2_add] + xor r15, r8 + + ;# init AVX registers (lanes 1-4) + vpxor ymm0, ymm0, ymm0 + movq xmm0, rbp + vpbroadcastq ymm0, xmm0 + vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments] + + ;# ymm0 *= r0_avx2_mul + vbroadcastsd ymm1, qword ptr [r0_avx2_mul] + vpsrlq ymm8, ymm0, 32 + vpsrlq ymm9, ymm1, 32 + vpmuludq ymm10, ymm0, ymm1 + vpmuludq ymm11, ymm9, ymm0 + vpmuludq ymm0, ymm8, ymm1 + vpsllq ymm11, ymm11, 32 + vpsllq ymm0, ymm0, 32 + vpaddq ymm10, ymm10, ymm11 + vpaddq ymm0, ymm10, ymm0 + + vbroadcastsd ymm1, qword ptr [r1_avx2_add] + vpxor ymm1, ymm0, ymm1 + vbroadcastsd ymm2, qword ptr [r2_avx2_add] + vpxor ymm2, ymm0, ymm2 + vbroadcastsd ymm3, qword ptr [r3_avx2_add] + vpxor ymm3, ymm0, ymm3 + vbroadcastsd ymm4, qword ptr [r4_avx2_add] + vpxor ymm4, ymm0, ymm4 + vbroadcastsd ymm5, qword ptr [r5_avx2_add] + vpxor ymm5, ymm0, ymm5 + vbroadcastsd ymm6, qword ptr [r6_avx2_add] + vpxor ymm6, ymm0, ymm6 + vbroadcastsd ymm7, qword ptr [r7_avx2_add] + vpxor ymm7, ymm0, ymm7 + + vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32) + vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63) +randomx_dataset_init_avx2_prologue ENDP + + ;# generated SuperscalarHash code goes here + +randomx_dataset_init_avx2_loop_end PROC + include asm/program_sshash_avx2_loop_end.inc +randomx_dataset_init_avx2_loop_end ENDP + +randomx_dataset_init_avx2_epilogue PROC + include asm/program_sshash_avx2_epilogue.inc +randomx_dataset_init_avx2_epilogue ENDP + +randomx_dataset_init_avx2_ssh_load PROC + include asm/program_sshash_avx2_ssh_load.inc +randomx_dataset_init_avx2_ssh_load ENDP + +randomx_dataset_init_avx2_ssh_prefetch PROC + include asm/program_sshash_avx2_ssh_prefetch.inc +randomx_dataset_init_avx2_ssh_prefetch ENDP + randomx_program_epilogue PROC include asm/program_epilogue_store.inc include asm/program_epilogue_win64.inc diff --git a/src/crypto/randomx/jit_compiler_x86_static.hpp b/src/crypto/randomx/jit_compiler_x86_static.hpp index 6523f9c4..121db5be 100644 --- a/src/crypto/randomx/jit_compiler_x86_static.hpp +++ b/src/crypto/randomx/jit_compiler_x86_static.hpp @@ -44,6 +44,11 @@ extern "C" { void randomx_program_loop_store(); void randomx_program_loop_end(); void randomx_dataset_init(); + void randomx_dataset_init_avx2_prologue(); + void randomx_dataset_init_avx2_loop_end(); + void randomx_dataset_init_avx2_epilogue(); + void randomx_dataset_init_avx2_ssh_load(); + void randomx_dataset_init_avx2_ssh_prefetch(); void randomx_program_epilogue(); void randomx_sshash_load(); void randomx_sshash_prefetch(); diff --git a/src/crypto/rx/RxDataset.cpp b/src/crypto/rx/RxDataset.cpp index 410a2f34..b47285a3 100644 --- a/src/crypto/rx/RxDataset.cpp +++ b/src/crypto/rx/RxDataset.cpp @@ -19,6 +19,7 @@ #include "crypto/rx/RxDataset.h" +#include "backend/cpu/Cpu.h" #include "base/io/log/Log.h" #include "base/io/log/Tags.h" #include "base/kernel/Platform.h" @@ -39,7 +40,13 @@ static void init_dataset_wrapper(randomx_dataset *dataset, randomx_cache *cache, { Platform::setThreadPriority(priority); - randomx_init_dataset(dataset, cache, startItem, itemCount); + if (Cpu::info()->hasAVX2() && (itemCount % 5)) { + randomx_init_dataset(dataset, cache, startItem, itemCount - (itemCount % 5)); + randomx_init_dataset(dataset, cache, startItem + itemCount - 5, 5); + } + else { + randomx_init_dataset(dataset, cache, startItem, itemCount); + } }