/*
Copyright (c) 2018-2020, tevador
Copyright (c) 2019-2020, SChernykh
Copyright (c) 2019-2020, XMRig

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright notice,
      this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright notice,
      this list of conditions and the following disclaimer in the documentation
      and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the names of its contributors
      may be used to endorse or promote products derived from this software
      without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdexcept>
#include <cstring>
#include <climits>
#include <atomic>

#include "crypto/randomx/jit_compiler_x86.hpp"
#include "backend/cpu/Cpu.h"
#include "crypto/common/VirtualMemory.h"
#include "crypto/randomx/jit_compiler_x86_static.hpp"
#include "crypto/randomx/program.hpp"
#include "crypto/randomx/reciprocal.h"
#include "crypto/randomx/superscalar.hpp"
#include "crypto/randomx/virtual_memory.hpp"
#include "crypto/rx/Profiler.h"

#ifdef XMRIG_FIX_RYZEN
#   include "crypto/rx/Rx.h"
#endif

#ifdef _MSC_VER
#   include <intrin.h>
#endif

static bool hugePagesJIT = false;

void randomx_set_huge_pages_jit(bool hugePages)
{
    hugePagesJIT = hugePages;
}

namespace randomx {
/*
REGISTER ALLOCATION:

; rax -> temporary
; rbx -> iteration counter "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> dataset pointer
; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8  -> "r0"
; r9  -> "r1"
; r10 -> "r2"
; r11 -> "r3"
; r12 -> "r4"
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0  -> "f0"
; xmm1  -> "f1"
; xmm2  -> "f2"
; xmm3  -> "f3"
; xmm4  -> "e0"
; xmm5  -> "e1"
; xmm6  -> "e2"
; xmm7  -> "e3"
; xmm8  -> "a0"
; xmm9  -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> E 'and' mask  = 0x00ffffffffffffff00ffffffffffffff
; xmm14 -> E 'or' mask   = 0x3*00000000******3*00000000******
; xmm15 -> scale mask    = 0x81f000000000000081f0000000000000
*/
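// Note on ADDR(): with MSVC incremental linking (debug and RelWithDebInfo
// builds), a function symbol may point at a 5-byte jump thunk (0xE9 rel32)
// instead of the function body. The label arithmetic below would then measure
// the thunk table rather than the generated code, so ADDR() follows the jump:
// target = &x + 5 + rel32.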
#if defined(_MSC_VER) && (defined(_DEBUG) || defined(RELWITHDEBINFO))
#define ADDR(x) ((((uint8_t*)&x)[0] == 0xE9) ? (((uint8_t*)&x) + *(const int32_t*)(((uint8_t*)&x) + 1) + 5) : ((uint8_t*)&x))
#else
#define ADDR(x) ((uint8_t*)&x)
#endif

#define codePrefetchScratchpad ADDR(randomx_prefetch_scratchpad)
#define codePrefetchScratchpadEnd ADDR(randomx_prefetch_scratchpad_end)
#define codePrologue ADDR(randomx_program_prologue)
#define codeLoopBegin ADDR(randomx_program_loop_begin)
#define codeLoopLoad ADDR(randomx_program_loop_load)
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
#define codeProgramStart ADDR(randomx_program_start)
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
#define codeProgramEnd ADDR(randomx_program_end)
#define codeShhLoad ADDR(randomx_sshash_load)
#define codeShhPrefetch ADDR(randomx_sshash_prefetch)
#define codeShhEnd ADDR(randomx_sshash_end)
#define codeShhInit ADDR(randomx_sshash_init)

#define prefetchScratchpadSize (codePrefetchScratchpadEnd - codePrefetchScratchpad)
#define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgramStart - codeLoopLoadXOP)
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
#define epilogueSize (codeShhLoad - codeEpilogue)
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
#define codeSshInitSize (codeProgramEnd - codeShhInit)

#define epilogueOffset ((CodeSize - epilogueSize) & ~63)

constexpr int32_t superScalarHashOffset = 32768;

static const uint8_t NOP1[] = { 0x90 };
static const uint8_t NOP2[] = { 0x66, 0x90 };
static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 };
static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
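// Padding rows used by h_CBRANCH when the Intel JCC-erratum workaround
// (BranchesWithin32B) has to push a branch off a 32-byte boundary. Row n
// holds n bytes: 0x2E (CS segment-override) prefixes, which are ignored on
// the following instruction in 64-bit mode, pad without adding extra
// instructions; once more than eight prefix bytes would pile onto one
// instruction, a short NOP covers the remainder. Ordinary inter-instruction
// padding uses the NOPX table above instead.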
static const uint8_t JMP_ALIGN_PREFIX[14][16] = {
    {},
    {0x2E},
    {0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};

static inline uint8_t* alignToPage(uint8_t* p, size_t pageSize)
{
    size_t k = (size_t) p;
    k -= k % pageSize;
    return (uint8_t*) k;
}

size_t JitCompilerX86::getCodeSize()
{
    return codePos < prologueSize ? 0 : codePos - prologueSize;
}

void JitCompilerX86::enableWriting() const
{
    uint8_t* p1 = alignToPage(code, 4096);
    uint8_t* p2 = code + CodeSize;
    xmrig::VirtualMemory::protectRW(p1, p2 - p1);
}

void JitCompilerX86::enableExecution() const
{
    uint8_t* p1 = alignToPage(code, 4096);
    uint8_t* p2 = code + CodeSize;
    xmrig::VirtualMemory::protectRX(p1, p2 - p1);
}

#ifdef _MSC_VER
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
#else
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return (a << shift) | (a >> (-shift & 31)); }
#endif

static std::atomic<size_t> codeOffset;
constexpr size_t codeOffsetIncrement = 59 * 64;
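// Every compiler instance claims a different start offset inside its buffer:
// codeOffset advances by 59 cache lines (59 * 64 = 3776 bytes) per instance,
// so the hot loops of different threads map to different L2/L3 cache sets
// instead of aliasing one another.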
JitCompilerX86::JitCompilerX86(bool hugePagesEnable)
{
    BranchesWithin32B = xmrig::Cpu::info()->jccErratum();

    hasAVX  = xmrig::Cpu::info()->hasAVX();
    hasAVX2 = xmrig::Cpu::info()->hasAVX2();

    // Set to false by default
    initDatasetAVX2 = false;

    xmrig::ICpuInfo::Vendor vendor = xmrig::Cpu::info()->vendor();
    xmrig::ICpuInfo::Arch arch = xmrig::Cpu::info()->arch();

    if (vendor == xmrig::ICpuInfo::VENDOR_INTEL) {
        // AVX2 init is faster on Intel CPUs without HT
        initDatasetAVX2 = xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads();
    }
    else if (vendor == xmrig::ICpuInfo::VENDOR_AMD) {
        switch (arch) {
        case xmrig::ICpuInfo::ARCH_ZEN:
        case xmrig::ICpuInfo::ARCH_ZEN_PLUS:
            // AVX2 init is slow on Zen/Zen+
            initDatasetAVX2 = false;
            break;

        case xmrig::ICpuInfo::ARCH_ZEN2:
            // AVX2 init is faster on Zen2 without SMT (mobile CPUs)
            initDatasetAVX2 = xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads();
            break;

        case xmrig::ICpuInfo::ARCH_ZEN3:
            // AVX2 init is faster on Zen3
            initDatasetAVX2 = true;
            break;

        default:
            break;
        }
    }

    // Sorry, low-end Intel CPUs
    if (!hasAVX2) {
        initDatasetAVX2 = false;
    }

    hasXOP = xmrig::Cpu::info()->hasXOP();

    allocatedSize = initDatasetAVX2 ? (CodeSize * 4) : (CodeSize * 2);
    allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
#   ifdef XMRIG_SECURE_JIT
        false
#   else
        hugePagesJIT && hugePagesEnable
#   endif
    ));

    // Shift the code base address to improve caching - all threads will use different L2/L3 cache sets
    code = allocatedCode + (codeOffset.fetch_add(codeOffsetIncrement) % CodeSize);

    memcpy(code, codePrologue, prologueSize);
    if (hasXOP) {
        memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
    }
    else {
        memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
    }
    memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);

    codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);

#   ifdef XMRIG_FIX_RYZEN
    mainLoopBounds.first  = code + prologueSize;
    mainLoopBounds.second = code + epilogueOffset;
#   endif
}

JitCompilerX86::~JitCompilerX86()
{
    codeOffset.fetch_sub(codeOffsetIncrement);
    freePagedMemory(allocatedCode, allocatedSize);
}

void JitCompilerX86::prepare()
{
    for (size_t i = 0; i < sizeof(engine); i += 64)
        rx_prefetch_nta((const char*)(&engine) + i);

    for (size_t i = 0; i < sizeof(RandomX_CurrentConfig); i += 64)
        rx_prefetch_nta((const char*)(&RandomX_CurrentConfig) + i);
}

void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags)
{
    PROFILE_SCOPE(RandomX_JIT_compile);

#   ifdef XMRIG_SECURE_JIT
    enableWriting();
#   endif

    vm_flags = flags;

    generateProgramPrologue(prog, pcfg);

    uint8_t* p;
    uint32_t n;
    if (flags & RANDOMX_FLAG_AMD) {
        p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
        n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
    }
    else {
        p = RandomX_CurrentConfig.codeReadDatasetTweaked;
        n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
    }
    memcpy(code + codePos, p, n);
    codePos += n;

    generateProgramEpilogue(prog, pcfg);
}

void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset)
{
    generateProgramPrologue(prog, pcfg);
    emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize, code, codePos);
    *(uint32_t*)(code + codePos) = 0xc381; // add ebx, imm32
    codePos += 2;
    emit32(datasetOffset / CacheLineSize, code, codePos);
    emitByte(0xe8, code, codePos); // call rel32 -> SuperscalarHash
    emit32(superScalarHashOffset - (codePos + 4), code, codePos);
    emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos);
    generateProgramEpilogue(prog, pcfg);
}
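// Generates the SuperscalarHash function used for dataset initialization.
// The CacheAccesses superscalar programs are emitted back to back, each
// followed by the cache-load stub; programs are chained by copying program
// j's address register into rbx ("49 8b d8+r" = mov rbx, r8+r), which drives
// the prefetch for program j+1. In the AVX2 variant every instruction is
// additionally emitted in ymm form, so four extra 64-bit lanes are computed
// per pass.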
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram (&programs)[N])
{
    uint8_t* p = code;

    if (initDatasetAVX2) {
        codePos = 0;
        emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);

        for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
            SuperscalarProgram& prog = programs[j];
            uint32_t pos = codePos;
            for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
                generateSuperscalarCode<true>(prog(i), p, pos);
            }
            codePos = pos;

            emit(codeShhLoad, codeSshLoadSize, code, codePos);
            emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);

            if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
                *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16); // mov rbx, r8+addressRegister
                codePos += 3;

                emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);

                uint8_t* p = code + codePos;
                emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
                p[3] += prog.getAddressRegister() << 3;
            }
        }

        emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);

        // Number of bytes from the start of randomx_dataset_init_avx2_prologue to the loop_begin label
        constexpr int32_t prologue_size = 320;
        *(int32_t*)(code + codePos - 4) = prologue_size - codePos;

        emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
        return;
    }

    memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
    codePos = superScalarHashOffset + codeSshInitSize;
    for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
        SuperscalarProgram& prog = programs[j];
        uint32_t pos = codePos;
        for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
            generateSuperscalarCode<false>(prog(i), p, pos);
        }
        codePos = pos;

        emit(codeShhLoad, codeSshLoadSize, code, codePos);
        if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
            *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16); // mov rbx, r8+addressRegister
            codePos += 3;
            emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
        }
    }
    emitByte(0xc3, code, codePos); // ret
}

template void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram (&programs)[RANDOMX_CACHE_MAX_ACCESSES]);

void JitCompilerX86::generateDatasetInitCode()
{
    // AVX2 code is generated in generateSuperscalarHash()
    if (!initDatasetAVX2) {
        memcpy(code, codeDatasetInit, datasetInitSize);
    }
}

void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg)
{
    codePos = ADDR(randomx_program_prologue_first_load) - ADDR(randomx_program_prologue);
    code[codePos + 2] = 0xc0 + pcfg.readReg0;
    code[codePos + 5] = 0xc0 + pcfg.readReg1;
    *(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
    *(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
    if (hasAVX) {
        uint32_t* p = (uint32_t*)(code + codePos + 67);
        *p = (*p & 0xFF000000U) | 0x0077F8C5U;
    }

#   ifdef XMRIG_FIX_RYZEN
    xmrig::Rx::setMainLoopBounds(mainLoopBounds);
#   endif

    memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
    codePos = codePosFirst;

    // Mark all registers as used
    uint64_t* r = (uint64_t*)registerUsage;
    uint64_t k = codePos;
    k |= k << 32;
    for (unsigned j = 0; j < RegistersCount / 2; ++j) {
        r[j] = k;
    }

    for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; i += 4) {
        Instruction& instr1 = prog(i);
        Instruction& instr2 = prog(i + 1);
        Instruction& instr3 = prog(i + 2);
        Instruction& instr4 = prog(i + 3);

        InstructionGeneratorX86 gen1 = engine[instr1.opcode];
        InstructionGeneratorX86 gen2 = engine[instr2.opcode];
        InstructionGeneratorX86 gen3 = engine[instr3.opcode];
        InstructionGeneratorX86 gen4 = engine[instr4.opcode];

        (*gen1)(this, instr1);
        (*gen2)(this, instr2);
        (*gen3)(this, instr3);
        (*gen4)(this, instr4);
    }

    *(uint64_t*)(code + codePos) = 0xc03341c08b41ull + (static_cast<uint64_t>(pcfg.readReg2) << 16) + (static_cast<uint64_t>(pcfg.readReg3) << 40);
    codePos += 6;
}

void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg)
{
    *(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40);
    codePos += 6;

    emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);

    memcpy(code + codePos, codeLoopStore, loopStoreSize);
    codePos += loopStoreSize;

    if (BranchesWithin32B) {
        const uint32_t branch_begin = static_cast<uint32_t>(codePos);
        const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);

        // If the jump crosses or touches a 32-byte boundary, align it
        if ((branch_begin ^ branch_end) >= 32) {
            uint32_t alignment_size = 32 - (branch_begin & 31);
            if (alignment_size > 8) {
                emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos);
                alignment_size = 8;
            }
            emit(NOPX[alignment_size - 1], alignment_size, code, codePos);
        }
    }

    *(uint64_t*)(code + codePos) = 0x850f01eb83ull; // sub ebx, 1; jnz rel32
    codePos += 5;
    emit32(prologueSize - codePos - 4, code, codePos);

    emitByte(0xe9, code, codePos); // jmp rel32 -> epilogue
    emit32(epilogueOffset - codePos - 4, code, codePos);
}
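// Emits one superscalar instruction. With AVX2 == true each instruction is
// emitted twice: the scalar form on r8-r15 and a ymm form that keeps four
// more 64-bit lanes in flight. The vector templates are fixed byte strings
// of VEX-encoded instructions; the p[i] += ... patches splice the register
// numbers into the ModRM and (inverted) VEX.vvvv fields. For example, in
// ISUB_R below, 0xC0FBFDC5 is "c5 fd fb c0" = vpsubq ymm0, ymm0, ymm0;
// adding src << 24 sets ModRM.rm, adding dst << 27 sets ModRM.reg, and
// subtracting dst << 11 turns vvvv into ymm_dst.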
template<bool AVX2>
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos)
{
    switch ((SuperscalarInstructionType)instr.opcode)
    {
    case randomx::SuperscalarInstructionType::ISUB_R:
        *(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
        codePos += 3;
        if (AVX2) {
            emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // vpsubq ymm_dst, ymm_dst, ymm_src
        }
        break;
    case randomx::SuperscalarInstructionType::IXOR_R:
        *(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
        codePos += 3;
        if (AVX2) {
            emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // vpxor ymm_dst, ymm_dst, ymm_src
        }
        break;
    case randomx::SuperscalarInstructionType::IADD_RS:
        emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
        if (AVX2) {
            if (instr.getModShift()) {
                static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
                uint8_t* p = code + codePos;
                emit(t, code, codePos);
                p[3] += instr.src;
                p[4] = instr.getModShift();
                p[8] += instr.dst * 9;
            }
            else {
                emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // vpaddq ymm_dst, ymm_dst, ymm_src
            }
        }
        break;
    case randomx::SuperscalarInstructionType::IMUL_R:
        emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20, 0xC5, 0xB5, 0x73, 0xD0, 0x20, 0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x35, 0xF4, 0xD8, 0xC5, 0xBD, 0xF4, 0xC0, 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
                0xC5, 0xFD, 0x73, 0xF0, 0x20, 0xC4, 0x41, 0x2D, 0xD4, 0xD3, 0xC5, 0xAD, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.dst;
            p[21] += instr.dst * 8 + instr.src;
            p[29] -= instr.dst * 8;
            p[31] += instr.dst;
            p[41] += instr.dst * 9;
        }
        break;
    case randomx::SuperscalarInstructionType::IROR_C:
        {
            const uint32_t shift = instr.getImm32() & 63;
            emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
            if (AVX2) {
                static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
                uint8_t* p = code + codePos;
                emit(t, code, codePos);
                p[3] += instr.dst;
                p[4] = shift;
                p[8] += instr.dst;
                p[9] = 64 - shift;
                p[14] += instr.dst * 8;
            }
        }
        break;
    case randomx::SuperscalarInstructionType::IADD_C7:
    case randomx::SuperscalarInstructionType::IADD_C8:
    case randomx::SuperscalarInstructionType::IADD_C9:
        if (AVX2) {
            static const uint8_t t[] = {
                0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0,
                0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
            p[12] += instr.dst * 8;
            p[24] -= instr.dst * 8;
            p[26] += instr.dst * 8;
        }
        else {
            *(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
            codePos += 3;
            emit32(instr.getImm32(), code, codePos);
        }
        break;
    case randomx::SuperscalarInstructionType::IXOR_C7:
    case randomx::SuperscalarInstructionType::IXOR_C8:
    case randomx::SuperscalarInstructionType::IXOR_C9:
        if (AVX2) {
            static const uint8_t t[] = {
                0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0,
                0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
            p[12] += instr.dst * 8;
            p[24] -= instr.dst * 8;
            p[26] += instr.dst * 8;
        }
        else {
            *(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
            codePos += 3;
            emit32(instr.getImm32(), code, codePos);
        }
        break;
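    // IMULH/ISMULH have no 256-bit counterpart of "mul r64": AVX2 only
    // provides a 32x32->64 multiply (vpmuludq, opcode 0xf4). The templates
    // below therefore build the upper 64 bits of each 64x64 product from
    // four 32-bit partial products plus their carries, using 32-bit qword
    // shifts (0x73 /2 and /6 with imm 0x20) to move halves into place; the
    // signed variant then corrects the result with the operands' sign masks.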
    case randomx::SuperscalarInstructionType::IMULH_R:
        *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); // mov rax, r_dst
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16); // mul r_src
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); // mov r_dst, rdx
        codePos += 3;
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20, 0xC5, 0xB5, 0x73, 0xD0, 0x20, 0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x3D, 0xF4, 0xD8, 0xC4, 0x41, 0x7D, 0xF4, 0xE1, 0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
                0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, 0xC4, 0x41, 0x25, 0xEF, 0xC6, 0xC4, 0x41, 0x25,
                0xD4, 0xDC, 0xC4, 0x41, 0x25, 0xD4, 0xDA, 0xC4, 0x41, 0x25, 0xEF, 0xCE, 0xC4, 0x42,
                0x3D, 0x37, 0xC1, 0xC4, 0x41, 0x3D, 0xDB, 0xC7, 0xC5, 0xBD, 0xD4, 0xC0, 0xC4, 0xC1,
                0x25, 0x73, 0xD3, 0x20, 0xC5, 0xA5, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.src;
            p[20] -= instr.dst * 8;
            p[27] += instr.dst * 8;
            p[67] += instr.dst * 9;
            p[77] += instr.dst * 9;
        }
        break;
    case randomx::SuperscalarInstructionType::ISMULH_R:
        *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); // mov rax, r_dst
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16); // imul r_src
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); // mov r_dst, rdx
        codePos += 3;
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20, 0xC5, 0xB5, 0x73, 0xD0, 0x20, 0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x3D, 0xF4, 0xD8, 0xC4, 0x41, 0x7D, 0xF4, 0xE1, 0xC4, 0x41, 0x3D, 0xF4, 0xE9,
                0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20, 0xC4, 0x41, 0x25, 0xEF, 0xC6, 0xC4, 0x41, 0x25,
                0xD4, 0xDC, 0xC4, 0x41, 0x25, 0xD4, 0xDA, 0xC4, 0x41, 0x25, 0xEF, 0xCE, 0xC4, 0x42,
                0x3D, 0x37, 0xC1, 0xC4, 0x41, 0x3D, 0xDB, 0xC7, 0xC4, 0x41, 0x15, 0xD4, 0xE8, 0xC4,
                0xC1, 0x25, 0x73, 0xD3, 0x20, 0xC4, 0x41, 0x15, 0xD4, 0xC3, 0xC4, 0x41, 0x35, 0xEF,
                0xC9, 0xC4, 0x62, 0x35, 0x37, 0xD0, 0xC4, 0x62, 0x35, 0x37, 0xD8, 0xC5, 0x2D, 0xDB,
                0xD0, 0xC5, 0x25, 0xDB, 0xD8, 0xC4, 0x41, 0x3D, 0xFB, 0xC2, 0xC4, 0xC1, 0x3D, 0xFB,
                0xC3
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.src;
            p[20] -= instr.dst * 8;
            p[89] += instr.dst;
            p[94] += instr.src;
            p[98] += instr.src;
            p[102] += instr.dst;
            p[112] += instr.dst * 8;
        }
        break;
    case randomx::SuperscalarInstructionType::IMUL_RCP:
        *(uint32_t*)(code + codePos) = 0x0000B848UL; // mov rax, imm64
        codePos += 2;
        emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
        emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos); // imul r_dst, rax
        if (AVX2) {
            static const uint8_t t[] = {
                0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF, 0xC5, 0xBD, 0x73, 0xD0, 0x20,
                0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20, 0xC4, 0x41, 0x7D, 0xF4, 0xD4, 0xC5, 0x35, 0xF4,
                0xD8, 0xC4, 0xC1, 0x3D, 0xF4, 0xC4, 0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20, 0xC5, 0xFD,
                0x73, 0xF0, 0x20, 0xC4, 0x41, 0x2D, 0xD4, 0xD3, 0xC5, 0xAD, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[12] += instr.dst;
            p[22] -= instr.dst * 8;
            p[28] += instr.dst;
            p[33] += instr.dst * 8;
            p[41] -= instr.dst * 8;
            p[43] += instr.dst;
            p[53] += instr.dst * 9;
        }
        break;
    default:
        UNREACHABLE;
    }
}

template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);
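// Computes the scratchpad address of a memory operand into eax (rax == true)
// or ecx: "lea eax/ecx, [r_src + imm32]" followed by "and eax/ecx, mask",
// where AddressMask[instr.getModMem()] selects the L1 or L2 range. The
// add_table constant packs the per-register instruction length into nibbles:
// every nibble is 3 except the one for RegisterNeedsSib (r12), whose rm
// encoding needs an extra SIB byte, making that lea 4 bytes long.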
template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos)
{
    *(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16); // lea eax/ecx, [r_src + imm32]

    constexpr uint32_t add_table = 0x33333333u + (1u << (RegisterNeedsSib * 4));
    codePos += (add_table >> (src * 4)) & 0xf;

    emit32(instr.getImm32(), code, codePos);
    if (rax) {
        emitByte(0x25, code, codePos); // and eax, imm32
    }
    else {
        *(uint32_t*)(code + codePos) = 0xe181; // and ecx, imm32
        codePos += 2;
    }
    emit32(AddressMask[instr.getModMem()], code, codePos);
}

template void JitCompilerX86::genAddressReg<false>(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos);
template void JitCompilerX86::genAddressReg<true>(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos);

FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, uint32_t& codePos)
{
    const uint32_t dst = static_cast<uint32_t>(instr.dst % RegistersCount) << 16;
    *(uint32_t*)(code + codePos) = 0x24808d41 + dst;
    codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3;

    emit32(instr.getImm32(), code, codePos);
    emitByte(0x25, code, codePos);

    const uint32_t mask1 = AddressMask[instr.getModMem()];
    const uint32_t mask2 = ScratchpadL3Mask;
    emit32((instr.mod < (StoreL3Condition << 4)) ? mask1 : mask2, code, codePos);
}

FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, uint32_t& codePos)
{
    emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
}
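// Per-instruction handlers. Each h_* function appends the machine code for
// one RandomX instruction at codePos and records the resulting position in
// registerUsage[dst]; h_CBRANCH later reads registerUsage to find the last
// instruction that wrote its target register and branches back behind it.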
void JitCompilerX86::h_IADD_RS(const Instruction& instr)
{
    uint32_t pos = codePos;
    uint8_t* const p = code + pos;

    const uint32_t dst = instr.dst % RegistersCount;
    const uint32_t sib = (instr.getModShift() << 6) | ((instr.src % RegistersCount) << 3) | dst;

    uint32_t k = 0x048d4f + (dst << 19);
    if (dst == RegisterNeedsDisplacement)
        k = 0xac8d4f;

    *(uint32_t*)(p) = k | (sib << 24);
    *(uint32_t*)(p + 4) = instr.getImm32();

    pos += ((dst == RegisterNeedsDisplacement) ? 8 : 4);

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IADD_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x0604034c + (dst << 19), p, pos); // add r_dst, [rsi+rax]
    }
    else {
        *(uint32_t*)(p + pos) = 0x86034c + (dst << 19); // add r_dst, [rsi+imm32]
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISUB_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc02b4d + (dst << 19) + (src << 16); // sub r_dst, r_src
        pos += 3;
    }
    else {
        *(uint32_t*)(p + pos) = 0xe88149 + (dst << 16); // sub r_dst, imm32
        pos += 3;
        emit32(instr.getImm32(), p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISUB_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x06042b4c + (dst << 19), p, pos); // sub r_dst, [rsi+rax]
    }
    else {
        *(uint32_t*)(p + pos) = 0x862b4c + (dst << 19);
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        emit32(0xc0af0f4d + ((dst * 8 + src) << 24), p, pos); // imul r_dst, r_src
    }
    else {
        *(uint32_t*)(p + pos) = 0xc0694d + (((dst << 3) + dst) << 16); // imul r_dst, r_dst, imm32
        pos += 3;
        emit32(instr.getImm32(), p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0604af0f4cull + (dst << 27); // imul r_dst, [rsi+rax]
        pos += 5;
    }
    else {
        emit32(0x86af0f4c + (dst << 27), p, pos);
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;

    *(uint32_t*)(p + pos) = 0xc08b49 + (dst << 16);     // mov rax, r_dst
    *(uint32_t*)(p + pos + 3) = 0xe0f749 + (src << 16); // mul r_src
    *(uint32_t*)(p + pos + 6) = 0xc28b4c + (dst << 19); // mov r_dst, rdx
    pos += 9;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_R_BMI2(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;

    *(uint32_t*)(p + pos) = 0xC4D08B49 + (dst << 16); // mov rdx, r_dst
    *(uint32_t*)(p + pos + 4) = 0xC0F6FB42 + (dst << 27) + (src << 24); // mulx
    pos += 8;

    registerUsage[dst] = pos;
    codePos = pos;
}
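// The *_BMI2 handlers are alternative encodings meant for CPUs with BMI2:
// mulx takes one multiplicand implicitly in rdx and writes the high 64 bits
// of the product straight to a destination register, so the shuffle through
// rax/rdx that the plain "mul" form needs is avoided.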
void JitCompilerX86::h_IMULH_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0e24f748c08b49ull + (dst << 16); // mov rax, r_dst; mul qword ptr [rsi+rcx]
        pos += 7;
    }
    else {
        *(uint64_t*)(p + pos) = 0xa6f748c08b49ull + (dst << 16);
        pos += 6;
        genAddressImm(instr, p, pos);
    }
    *(uint32_t*)(p + pos) = 0xc28b4c + (dst << 19); // mov r_dst, rdx
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_M_BMI2(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint32_t*)(p + pos) = static_cast<uint32_t>(0xC4D08B49 + (dst << 16)); // mov rdx, r_dst
        *(uint64_t*)(p + pos + 4) = 0x0E04F6FB62ULL + (dst << 27); // mulx r_dst, [rsi+rcx]
        pos += 9;
    }
    else {
        *(uint64_t*)(p + pos) = 0x86F6FB62C4D08B49ULL + (dst << 16) + (dst << 59);
        *(uint32_t*)(p + pos + 8) = instr.getImm32() & ScratchpadL3Mask;
        pos += 12;
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISMULH_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;

    *(uint64_t*)(p + pos) = 0x8b4ce8f749c08b49ull + (dst << 16) + (src << 40); // mov rax, r_dst; imul r_src
    pos += 8;
    emitByte(0xc2 + 8 * dst, p, pos); // mov r_dst, rdx

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISMULH_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0e2cf748c08b49ull + (dst << 16); // mov rax, r_dst; imul qword ptr [rsi+rcx]
        pos += 7;
    }
    else {
        *(uint64_t*)(p + pos) = 0xaef748c08b49ull + (dst << 16);
        pos += 6;
        genAddressImm(instr, p, pos);
    }
    *(uint32_t*)(p + pos) = 0xc28b4c + (dst << 19); // mov r_dst, rdx
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_RCP(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    uint64_t divisor = instr.getImm32();
    if (!isZeroOrPowerOf2(divisor)) {
        *(uint32_t*)(p + pos) = 0xb848; // mov rax, imm64
        pos += 2;

        emit64(randomx_reciprocal_fast(divisor), p, pos);

        const uint32_t dst = instr.dst % RegistersCount;
        emit32(0xc0af0f4c + (dst << 27), p, pos); // imul r_dst, rax

        registerUsage[dst] = pos;
    }

    codePos = pos;
}

void JitCompilerX86::h_INEG_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t dst = instr.dst % RegistersCount;
    *(uint32_t*)(p + pos) = 0xd8f749 + (dst << 16); // neg r_dst
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IXOR_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc0334d + (((dst << 3) + src) << 16); // xor r_dst, r_src
        pos += 3;
    }
    else {
        const uint64_t imm = instr.getImm32();
        *(uint64_t*)(p + pos) = (imm << 24) + 0xf08149 + (dst << 16); // xor r_dst, imm32
        pos += 7;
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IXOR_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x0604334c + (dst << 19), p, pos); // xor r_dst, [rsi+rax]
    }
    else {
        *(uint32_t*)(p + pos) = 0x86334c + (dst << 19);
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}
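// Rotate handlers: x86 accepts a variable rotate count only in cl, so the
// register form is "mov ecx, e_src; ror/rol r_dst, cl" ("41 8b c8+src /
// 49 d3 c8+dst" below); the immediate form encodes count & 63 directly
// ("49 c1 c8+dst imm8").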
void JitCompilerX86::h_IROR_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        *(uint64_t*)(p + pos) = 0xc8d349c88b41ull + (src << 16) + (dst << 40);
        pos += 6;
    }
    else {
        *(uint32_t*)(p + pos) = 0xc8c149 + (dst << 16);
        pos += 3;
        emitByte(instr.getImm32() & 63, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IROL_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        *(uint64_t*)(p + pos) = 0xc0d349c88b41ull + (src << 16) + (dst << 40);
        pos += 6;
    }
    else {
        *(uint32_t*)(p + pos) = 0xc0c149 + (dst << 16);
        pos += 3;
        emitByte(instr.getImm32() & 63, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISWAP_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegistersCount;
    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc0874d + (((dst << 3) + src) << 16); // xchg r_dst, r_src
        pos += 3;
        registerUsage[dst] = pos;
        registerUsage[src] = pos;
    }

    codePos = pos;
}

void JitCompilerX86::h_FSWAP_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t dst = instr.dst % RegistersCount;
    *(uint64_t*)(p + pos) = 0x01c0c60f66ull + (((dst << 3) + dst) << 24); // shufpd xmm_dst, xmm_dst, 1
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FADD_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;
    *(uint64_t*)(p + pos) = 0xc0580f4166ull + (((dst << 3) + src) << 32); // addpd xmm_dst, xmm_src
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FADD_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegisterCountFlt;
    genAddressReg<true>(instr, src, p, pos);
    *(uint64_t*)(p + pos) = 0x41660624e60f44f3ull; // cvtdq2pd xmm12, [rsi+rax]
    *(uint32_t*)(p + pos + 8) = 0xc4580f + (dst << 19); // addpd xmm_dst, xmm12
    pos += 11;

    codePos = pos;
}

void JitCompilerX86::h_FSUB_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;
    *(uint64_t*)(p + pos) = 0xc05c0f4166ull + (((dst << 3) + src) << 32); // subpd xmm_dst, xmm_src
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FSUB_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint32_t dst = instr.dst % RegisterCountFlt;
    genAddressReg<true>(instr, src, p, pos);
    *(uint64_t*)(p + pos) = 0x41660624e60f44f3ull; // cvtdq2pd xmm12, [rsi+rax]
    *(uint32_t*)(p + pos + 8) = 0xc45c0f + (dst << 19); // subpd xmm_dst, xmm12
    pos += 11;

    codePos = pos;
}

void JitCompilerX86::h_FSCAL_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t dst = instr.dst % RegisterCountFlt;
    emit32(0xc7570f41 + (dst << 27), p, pos); // xorps xmm_dst, xmm15 (scale mask)

    codePos = pos;
}

void JitCompilerX86::h_FMUL_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;
    *(uint64_t*)(p + pos) = 0xe0590f4166ull + (((dst << 3) + src) << 32); // mulpd xmm_e_dst, xmm_e_src
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FDIV_M(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;
    const uint64_t dst = instr.dst % RegisterCountFlt;
    genAddressReg<true>(instr, src, p, pos);
    *(uint64_t*)(p + pos) = 0x0624e60f44f3ull; // cvtdq2pd xmm12, [rsi+rax]
    pos += 6;
    if (hasXOP) {
        *(uint64_t*)(p + pos) = 0xd0e6a218488full; // vpcmov: apply E 'and'/'or' masks in one op
        pos += 6;
    }
    else {
        *(uint64_t*)(p + pos) = 0xe6560f45e5540f45ull; // andps xmm12, xmm13; orps xmm12, xmm14
        pos += 8;
    }
    *(uint64_t*)(p + pos) = 0xe45e0f4166ull + (dst << 35); // divpd xmm_e_dst, xmm12
    pos += 5;

    codePos = pos;
}
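// In h_FDIV_M above, the divisor loaded from the scratchpad is first forced
// into a safe range: it is ANDed with the E 'and' mask (xmm13) and ORed with
// the E 'or' mask (xmm14) from the register map at the top of this file,
// pinning the exponent so the division can produce neither infinities nor
// denormals. With XOP this is a single vpcmov; otherwise an andps/orps pair
// ("45 0f 54 e5" / "45 0f 56 e6") is emitted on the xmm12 temporary.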
void JitCompilerX86::h_FSQRT_R(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t dst = instr.dst % RegisterCountFlt;
    emit32(0xe4510f66 + (((dst << 3) + dst) << 24), p, pos); // sqrtpd xmm_e_dst, xmm_e_dst

    codePos = pos;
}

void JitCompilerX86::h_CFROUND(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src % RegistersCount;

    *(uint32_t*)(p + pos) = 0x00C08B49 + (src << 16); // mov rax, r_src
    // Rotate the two selected bits into positions 2-3: "and eax, 12" then
    // yields 0/4/8/12, an index into the MXCSR images staged on the stack
    // by the static prologue code
    const int rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
    *(uint32_t*)(p + pos + 3) = 0x00C8C148 + (rotate << 24); // ror rax, rotate

    if (vm_flags & RANDOMX_FLAG_AMD) {
        // Skip the ldmxcsr when the rounding mode is unchanged (cached at [rsp+32])
        *(uint64_t*)(p + pos + 7) = 0x742024443B0CE083ULL;
        *(uint64_t*)(p + pos + 15) = 0x8900EB0414AE0F0AULL;
        *(uint32_t*)(p + pos + 23) = 0x202444;
        pos += 26;
    }
    else {
        *(uint64_t*)(p + pos + 7) = 0x0414AE0F0CE083ULL; // and eax, 12; ldmxcsr [rsp+rax]
        pos += 14;
    }

    codePos = pos;
}

void JitCompilerX86::h_CFROUND_BMI2(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src % RegistersCount;

    const uint64_t rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
    *(uint64_t*)(p + pos) = 0xC0F0FBC3C4ULL | (src << 32) | (rotate << 40); // rorx rax, r_src, rotate

    if (vm_flags & RANDOMX_FLAG_AMD) {
        // Skip the ldmxcsr when the rounding mode is unchanged (cached at [rsp+32])
        *(uint64_t*)(p + pos + 6) = 0x742024443B0CE083ULL;
        *(uint64_t*)(p + pos + 14) = 0x8900EB0414AE0F0AULL;
        *(uint32_t*)(p + pos + 22) = 0x202444;
        pos += 25;
    }
    else {
        *(uint64_t*)(p + pos + 6) = 0x0414AE0F0CE083ULL; // and eax, 12; ldmxcsr [rsp+rax]
        pos += 13;
    }

    codePos = pos;
}

template<bool jccErratum>
void JitCompilerX86::h_CBRANCH(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const int reg = instr.dst % RegistersCount;
    int32_t jmp_offset = registerUsage[reg] - (pos + 16);

    if (jccErratum) {
        const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
        const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));

        // If the jump crosses or touches a 32-byte boundary, align it
        if ((branch_begin ^ branch_end) >= 32) {
            const uint32_t alignment_size = 32 - (branch_begin & 31);
            jmp_offset -= alignment_size;
            emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos);
        }
    }

    *(uint32_t*)(p + pos) = 0x00c08149 + (reg << 16); // add r_reg, imm32
    const int shift = instr.getModCond();
    const uint32_t or_mask = (1UL << RandomX_ConfigurationBase::JumpOffset) << shift;
    const uint32_t and_mask = rotl32(~static_cast<uint32_t>(1UL << (RandomX_ConfigurationBase::JumpOffset - 1)), shift);
    *(uint32_t*)(p + pos + 3) = (instr.getImm32() | or_mask) & and_mask;
    *(uint32_t*)(p + pos + 7) = 0x00c0f749 + (reg << 16); // test r_reg, imm32
    *(uint32_t*)(p + pos + 10) = RandomX_ConfigurationBase::ConditionMask_Calculated << shift;
    pos += 14;

    if (jmp_offset >= -128) {
        *(uint32_t*)(p + pos) = 0x74 + (static_cast<uint32_t>(jmp_offset) << 8); // jz rel8
        pos += 2;
    }
    else {
        *(uint64_t*)(p + pos) = 0x840f + (static_cast<uint64_t>(jmp_offset - 4) << 16); // jz rel32
        pos += 6;
    }

    // Mark all registers as used
    uint64_t* r = (uint64_t*) registerUsage;
    uint64_t k = pos;
    k |= k << 32;
    for (unsigned j = 0; j < RegistersCount / 2; ++j) {
        r[j] = k;
    }

    codePos = pos;
}

template void JitCompilerX86::h_CBRANCH<false>(const Instruction&);
template void JitCompilerX86::h_CBRANCH<true>(const Instruction&);

void JitCompilerX86::h_ISTORE(const Instruction& instr)
{
    uint8_t* const p = code;
    uint32_t pos = codePos;

    genAddressRegDst(instr, p, pos);
    emit32(0x0604894c + (static_cast<uint32_t>(instr.src % RegistersCount) << 19), p, pos); // mov [rsi+rax], r_src

    codePos = pos;
}

void JitCompilerX86::h_NOP(const Instruction& instr)
{
    emitByte(0x90, code, codePos);
}

alignas(64) InstructionGeneratorX86 JitCompilerX86::engine[256] = {};

} // namespace randomx