/*
Copyright (c) 2018-2020, tevador <tevador@gmail.com>
Copyright (c) 2019-2021, SChernykh <https://github.com/SChernykh>
Copyright (c) 2019-2021, XMRig <https://github.com/xmrig>, <support@xmrig.com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the copyright holder nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdexcept>
#include <cstring>
#include <climits>
#include <atomic>

#include "crypto/randomx/jit_compiler_x86.hpp"
#include "backend/cpu/Cpu.h"
#include "crypto/common/VirtualMemory.h"
#include "crypto/randomx/jit_compiler_x86_static.hpp"
#include "crypto/randomx/program.hpp"
#include "crypto/randomx/reciprocal.h"
#include "crypto/randomx/superscalar.hpp"
#include "crypto/randomx/virtual_memory.hpp"
#include "crypto/rx/Profiler.h"

#ifdef XMRIG_FIX_RYZEN
# include "crypto/rx/RxFix.h"
#endif

#ifdef _MSC_VER
# include <intrin.h>
#endif

static bool hugePagesJIT = false;
static int optimizedDatasetInit = -1;

void randomx_set_huge_pages_jit(bool hugePages)
{
    hugePagesJIT = hugePages;
}

void randomx_set_optimized_dataset_init(int value)
{
    optimizedDatasetInit = value;
}

namespace randomx {

/*

REGISTER ALLOCATION:

; rax -> temporary
; rbx -> iteration counter "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> dataset pointer
; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8  -> "r0"
; r9  -> "r1"
; r10 -> "r2"
; r11 -> "r3"
; r12 -> "r4"
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0  -> "f0"
; xmm1  -> "f1"
; xmm2  -> "f2"
; xmm3  -> "f3"
; xmm4  -> "e0"
; xmm5  -> "e1"
; xmm6  -> "e2"
; xmm7  -> "e3"
; xmm8  -> "a0"
; xmm9  -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
; xmm14 -> E 'or' mask  = 0x3*00000000******3*00000000******
; xmm15 -> scale mask   = 0x81f000000000000081f0000000000000

*/

# if defined(_MSC_VER) && (defined(_DEBUG) || defined(RELWITHDEBINFO))
#define ADDR(x) ((((uint8_t*)&x)[0] == 0xE9) ? (((uint8_t*)&x) + *(const int32_t*)(((uint8_t*)&x) + 1) + 5) : ((uint8_t*)&x))
# else
#define ADDR(x) ((uint8_t*)&x)
# endif

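// In MSVC debug builds a function symbol can resolve to an incremental-linking
// thunk: a 5-byte "JMP rel32" (opcode 0xE9) instead of the function body.
// ADDR() detects that jump and follows it, so the label macros below always
// point at the real machine code before it is copied with memcpy.
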
#define codePrologue ADDR(randomx_program_prologue)
#define codeLoopBegin ADDR(randomx_program_loop_begin)
#define codeLoopLoad ADDR(randomx_program_loop_load)
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
#define codeProgramStart ADDR(randomx_program_start)
#define codeReadDataset ADDR(randomx_program_read_dataset)
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2Prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2LoopEnd ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2Epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2SshLoad ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2SshPrefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
#define codeProgramEnd ADDR(randomx_program_end)
#define codeSshLoad ADDR(randomx_sshash_load)
#define codeSshPrefetch ADDR(randomx_sshash_prefetch)
#define codeSshEnd ADDR(randomx_sshash_end)
#define codeSshInit ADDR(randomx_sshash_init)

#define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgramStart - codeLoopLoadXOP)
#define readDatasetSize (codeReadDatasetLightSshInit - codeReadDataset)
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeDatasetInitAVX2Prologue - codeDatasetInit)
#define datasetInitAVX2PrologueSize (codeDatasetInitAVX2LoopEnd - codeDatasetInitAVX2Prologue)
#define datasetInitAVX2LoopEndSize (codeDatasetInitAVX2Epilogue - codeDatasetInitAVX2LoopEnd)
#define datasetInitAVX2EpilogueSize (codeDatasetInitAVX2SshLoad - codeDatasetInitAVX2Epilogue)
#define datasetInitAVX2SshLoadSize (codeDatasetInitAVX2SshPrefetch - codeDatasetInitAVX2SshLoad)
#define datasetInitAVX2SshPrefetchSize (codeEpilogue - codeDatasetInitAVX2SshPrefetch)
#define epilogueSize (codeSshLoad - codeEpilogue)
#define codeSshLoadSize (codeSshPrefetch - codeSshLoad)
#define codeSshPrefetchSize (codeSshEnd - codeSshPrefetch)
#define codeSshInitSize (codeProgramEnd - codeSshInit)

#define epilogueOffset ((CodeSize - epilogueSize) & ~63)

constexpr int32_t superScalarHashOffset = 32768;

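// Resulting layout of one CodeSize window (see the constructor below):
//   0 ..................... prologue + loop load code
//   codePosFirst .......... JIT-compiled program instructions
//   superScalarHashOffset . superscalar hash program (light mode), 32 KiB in
//   epilogueOffset ........ epilogue, aligned down to a 64-byte boundary
// The backing allocation is at least CodeSize * 2, so the randomized
// per-instance base offset still leaves one full window available.
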
static const uint8_t NOP1[] = { 0x90 };
static const uint8_t NOP2[] = { 0x66, 0x90 };
static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 };
static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP9[] = { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8, NOP9 };

static const uint8_t NOP13[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP14[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP25[] = { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP26[] = { 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

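// NOP1..NOP9 are the multi-byte NOP encodings recommended by the Intel SDM;
// they pad branches away from 32-byte boundaries (JCC erratum workaround).
// NOP13/NOP25 and NOP14/NOP26 match the exact sizes of the BMI2 and non-BMI2
// CFROUND sequences below, so a stale CFROUND can be overwritten in place.
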
static const uint8_t JMP_ALIGN_PREFIX[14][16] = {
    {},
    {0x2E},
    {0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
    {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};

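// 0x2E is the CS segment-override prefix, which has no effect when attached
// to a JMP/Jcc; a run of these prefixes shifts a branch forward relative to a
// 32-byte boundary without inserting extra instructions. Rows needing more
// than 8 padding bytes start with a short NOP instead, presumably to avoid
// decoder penalties from overly long prefix runs.
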
static inline uint8_t* alignToPage(uint8_t* p, size_t pageSize) {
    size_t k = (size_t) p;
    k -= k % pageSize;
    return (uint8_t*) k;
}

size_t JitCompilerX86::getCodeSize() {
    return codePos < prologueSize ? 0 : codePos - prologueSize;
}

void JitCompilerX86::enableWriting() const {
    uint8_t* p1 = alignToPage(code, 4096);
    uint8_t* p2 = code + CodeSize;
    xmrig::VirtualMemory::protectRW(p1, p2 - p1);
}

void JitCompilerX86::enableExecution() const {
    uint8_t* p1 = alignToPage(code, 4096);
    uint8_t* p2 = code + CodeSize;
    xmrig::VirtualMemory::protectRX(p1, p2 - p1);
}

# ifdef _MSC_VER
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
# else
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return (a << shift) | (a >> (-shift & 31)); }
# endif

static std::atomic<size_t> codeOffset;
constexpr size_t codeOffsetIncrement = 59 * 64;

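// "(a << shift) | (a >> (-shift & 31))" is the standard branch-free rotate:
// "-shift & 31" maps shift == 0 to 0, avoiding the undefined 32-bit shift,
// and compilers recognize the idiom and emit a single ROL instruction.
// codeOffsetIncrement is 59 cache lines; 59 is prime, so successive compiler
// instances are staggered into different L2/L3 cache sets.
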
JitCompilerX86::JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable) {
    BranchesWithin32B = xmrig::Cpu::info()->jccErratum();

    hasAVX = xmrig::Cpu::info()->hasAVX();
    hasAVX2 = xmrig::Cpu::info()->hasAVX2();

    // Disabled by default
    initDatasetAVX2 = false;

    if (optimizedInitDatasetEnable) {
        // Dataset init using AVX2:
        // -1 = Auto detect
        //  0 = Always disabled
        // +1 = Always enabled
        if (optimizedDatasetInit > 0) {
            initDatasetAVX2 = true;
        }
        else if (optimizedDatasetInit < 0) {
            xmrig::ICpuInfo::Vendor vendor = xmrig::Cpu::info()->vendor();
            xmrig::ICpuInfo::Arch arch = xmrig::Cpu::info()->arch();

            if (vendor == xmrig::ICpuInfo::VENDOR_INTEL) {
                // AVX2 init is faster on Intel CPUs without HT
                initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
            }
            else if (vendor == xmrig::ICpuInfo::VENDOR_AMD) {
                switch (arch) {
                case xmrig::ICpuInfo::ARCH_ZEN:
                case xmrig::ICpuInfo::ARCH_ZEN_PLUS:
                default:
                    // AVX2 init is slower on Zen/Zen+
                    // Also disable it for other unknown architectures
                    initDatasetAVX2 = false;
                    break;

                case xmrig::ICpuInfo::ARCH_ZEN2:
                    // AVX2 init is faster on Zen2 without SMT (mobile CPUs)
                    initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
                    break;

                case xmrig::ICpuInfo::ARCH_ZEN3:
                    // AVX2 init is faster on Zen3
                    initDatasetAVX2 = true;
                    break;

                case xmrig::ICpuInfo::ARCH_ZEN4:
                    // AVX2 init is slower on Zen4
                    initDatasetAVX2 = false;
                    break;
                }
            }
        }
    }

    // Sorry, low-end Intel CPUs
    if (!hasAVX2) {
        initDatasetAVX2 = false;
    }

    hasXOP = xmrig::Cpu::info()->hasXOP();

    allocatedSize = initDatasetAVX2 ? (CodeSize * 4) : (CodeSize * 2);
    allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
# ifdef XMRIG_SECURE_JIT
        false
# else
        hugePagesJIT && hugePagesEnable
# endif
    ));

    // Shift the code base address to improve caching - all threads will use different L2/L3 cache sets
    code = allocatedCode + (codeOffset.fetch_add(codeOffsetIncrement) % CodeSize);

    memcpy(code, codePrologue, prologueSize);
    if (hasXOP) {
        memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
    }
    else {
        memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
    }
    memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);

    codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);

# ifdef XMRIG_FIX_RYZEN
    mainLoopBounds.first = code + prologueSize;
    mainLoopBounds.second = code + epilogueOffset;
# endif
}

JitCompilerX86::~JitCompilerX86() {
    codeOffset.fetch_sub(codeOffsetIncrement);
    freePagedMemory(allocatedCode, allocatedSize);
}

template<size_t N>
static FORCE_INLINE void prefetch_data(const void* data) {
    rx_prefetch_nta(data);
    prefetch_data<N - 1>(reinterpret_cast<const char*>(data) + 64);
}

template<> FORCE_INLINE void prefetch_data<0>(const void*) {}

template<typename T> static FORCE_INLINE void prefetch_data(const T& data) { prefetch_data<(sizeof(T) + 63) / 64>(&data); }

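// Compile-time recursion: prefetch_data<N> issues one non-temporal prefetch
// per 64-byte cache line and unrolls completely, so prefetching a whole
// structure costs exactly ceil(sizeof(T) / 64) back-to-back prefetch
// instructions with no loop.
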
void JitCompilerX86::prepare() {
    prefetch_data(engine);
    prefetch_data(RandomX_CurrentConfig);
}

void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
    PROFILE_SCOPE(RandomX_JIT_compile);

# ifdef XMRIG_SECURE_JIT
    enableWriting();
# endif

    vm_flags = flags;

    generateProgramPrologue(prog, pcfg);
    emit(codeReadDataset, readDatasetSize, code, codePos);
    generateProgramEpilogue(prog, pcfg);
}

void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
    generateProgramPrologue(prog, pcfg);
    emit(codeReadDatasetLightSshInit, readDatasetLightInitSize, code, codePos);
    *(uint32_t*)(code + codePos) = 0xc381;
    codePos += 2;
    emit32(datasetOffset / CacheLineSize, code, codePos);
    emitByte(0xe8, code, codePos);
    emit32(superScalarHashOffset - (codePos + 4), code, codePos);
    emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos);
    generateProgramEpilogue(prog, pcfg);
}

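// The bytes emitted above decode as "81 C3 imm32" (add ebx, imm32 -- the
// dataset offset in cache lines) followed by "E8 rel32", a near call into
// the superscalar hash routine located at superScalarHashOffset.
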
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
    uint8_t* p = code;
    if (initDatasetAVX2) {
        codePos = 0;
        emit(codeDatasetInitAVX2Prologue, datasetInitAVX2PrologueSize, code, codePos);

        for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
            SuperscalarProgram& prog = programs[j];
            uint32_t pos = codePos;
            for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
                generateSuperscalarCode<true>(prog(i), p, pos);
            }
            codePos = pos;
            emit(codeSshLoad, codeSshLoadSize, code, codePos);
            emit(codeDatasetInitAVX2SshLoad, datasetInitAVX2SshLoadSize, code, codePos);
            if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
                *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
                codePos += 3;
                emit(RandomX_CurrentConfig.codeSshPrefetchTweaked, codeSshPrefetchSize, code, codePos);
                uint8_t* q = code + codePos;
                emit(codeDatasetInitAVX2SshPrefetch, datasetInitAVX2SshPrefetchSize, code, codePos);
                q[3] += prog.getAddressRegister() << 3;
            }
        }

        emit(codeDatasetInitAVX2LoopEnd, datasetInitAVX2LoopEndSize, code, codePos);

        // Number of bytes from the start of randomx_dataset_init_avx2_prologue to the loop_begin label
        constexpr int32_t prologue_size = 320;
        *(int32_t*)(code + codePos - 4) = prologue_size - codePos;

        emit(codeDatasetInitAVX2Epilogue, datasetInitAVX2EpilogueSize, code, codePos);
        return;
    }

    memcpy(code + superScalarHashOffset, codeSshInit, codeSshInitSize);
    codePos = superScalarHashOffset + codeSshInitSize;
    for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
        SuperscalarProgram& prog = programs[j];
        uint32_t pos = codePos;
        for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
            generateSuperscalarCode<false>(prog(i), p, pos);
        }
        codePos = pos;
        emit(codeSshLoad, codeSshLoadSize, code, codePos);
        if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
            *(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
            codePos += 3;
            emit(RandomX_CurrentConfig.codeSshPrefetchTweaked, codeSshPrefetchSize, code, codePos);
        }
    }
    emitByte(0xc3, code, codePos);
}

template
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);

void JitCompilerX86::generateDatasetInitCode() {
    // AVX2 code is generated in generateSuperscalarHash()
    if (!initDatasetAVX2) {
        memcpy(code, codeDatasetInit, datasetInitSize);
    }
}

void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
    codePos = ADDR(randomx_program_prologue_first_load) - ADDR(randomx_program_prologue);
    *(uint32_t*)(code + codePos + 4) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
    *(uint32_t*)(code + codePos + 14) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
    if (hasAVX) {
        uint32_t* p = (uint32_t*)(code + codePos + 61);
        *p = (*p & 0xFF000000U) | 0x0077F8C5U; // vzeroupper
    }

# ifdef XMRIG_FIX_RYZEN
    xmrig::RxFix::setMainLoopBounds(mainLoopBounds);
# endif

    imul_rcp_storage = code + (ADDR(randomx_program_imul_rcp_store) - codePrologue) + 2;
    imul_rcp_storage_used = 0;

    memcpy(imul_rcp_storage - 34, &pcfg.eMask, sizeof(pcfg.eMask));
    codePos = codePosFirst;
    prevCFROUND = -1;
    prevFPOperation = -1;

    // Mark all registers as used
    uint64_t* r = (uint64_t*)registerUsage;
    uint64_t k = codePos;
    k |= k << 32;
    for (unsigned j = 0; j < RegistersCount / 2; ++j) {
        r[j] = k;
    }

    for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; i += 4) {
        Instruction& instr1 = prog(i);
        Instruction& instr2 = prog(i + 1);
        Instruction& instr3 = prog(i + 2);
        Instruction& instr4 = prog(i + 3);

        InstructionGeneratorX86 gen1 = engine[instr1.opcode];
        InstructionGeneratorX86 gen2 = engine[instr2.opcode];
        InstructionGeneratorX86 gen3 = engine[instr3.opcode];
        InstructionGeneratorX86 gen4 = engine[instr4.opcode];

        (*gen1)(this, instr1);
        (*gen2)(this, instr2);
        (*gen3)(this, instr3);
        (*gen4)(this, instr4);
    }

    *(uint64_t*)(code + codePos) = 0xc03341c08b41ull + (static_cast<uint64_t>(pcfg.readReg2) << 16) + (static_cast<uint64_t>(pcfg.readReg3) << 40);
    codePos += 6;
}

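// The final 6 bytes decode as "41 8B C0+r" (mov eax, readReg2) and
// "41 33 C0+r" (xor eax, readReg3): per the RandomX specification this value
// is mixed into the dataset address register "mx" by the dataset-read code
// that follows the compiled program block.
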
void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
    *(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40);
    codePos += 6;
    emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, RandomX_CurrentConfig.codePrefetchScratchpadTweakedSize, code, codePos);
    memcpy(code + codePos, codeLoopStore, loopStoreSize);
    codePos += loopStoreSize;

    if (BranchesWithin32B) {
        const uint32_t branch_begin = static_cast<uint32_t>(codePos);
        const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);

        // If the jump crosses or touches a 32-byte boundary, align it
        if ((branch_begin ^ branch_end) >= 32) {
            uint32_t alignment_size = 32 - (branch_begin & 31);
            if (alignment_size > 8) {
                emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos);
                alignment_size = 8;
            }
            emit(NOPX[alignment_size - 1], alignment_size, code, codePos);
        }
    }

    *(uint64_t*)(code + codePos) = 0x850f01eb83ull;
    codePos += 5;
    emit32(prologueSize - codePos - 4, code, codePos);
    emitByte(0xe9, code, codePos);
    emit32(epilogueOffset - codePos - 4, code, codePos);
}

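// "83 EB 01" is sub ebx, 1 and "0F 85 rel32" is jnz back to the loop start;
// the final "E9 rel32" jumps to the out-of-line epilogue. The NOP padding
// above keeps the sub+jnz pair from crossing a 32-byte boundary on CPUs
// affected by the Intel JCC erratum.
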
template<bool AVX2>
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) {
    switch ((SuperscalarInstructionType)instr.opcode)
    {
    case randomx::SuperscalarInstructionType::ISUB_R:
        *(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
        codePos += 3;
        if (AVX2) {
            emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
        }
        break;

    case randomx::SuperscalarInstructionType::IXOR_R:
        *(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
        codePos += 3;
        if (AVX2) {
            emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
        }
        break;

    case randomx::SuperscalarInstructionType::IADD_RS:
        emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
        if (AVX2) {
            if (instr.getModShift()) {
                static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
                uint8_t* p = code + codePos;
                emit(t, code, codePos);
                p[3] += instr.src;
                p[4] = instr.getModShift();
                p[8] += instr.dst * 9;
            }
            else {
                emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
            }
        }
        break;

    case randomx::SuperscalarInstructionType::IMUL_R:
        emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20,
                0xC5, 0xB5, 0x73, 0xD0, 0x20,
                0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x35, 0xF4, 0xD8,
                0xC5, 0xBD, 0xF4, 0xC0,
                0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
                0xC5, 0xFD, 0x73, 0xF0, 0x20,
                0xC4, 0x41, 0x2D, 0xD4, 0xD3,
                0xC5, 0xAD, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.dst;
            p[21] += instr.dst * 8 + instr.src;
            p[29] -= instr.dst * 8;
            p[31] += instr.dst;
            p[41] += instr.dst * 9;
        }
        break;

    case randomx::SuperscalarInstructionType::IROR_C:
    {
        const uint32_t shift = instr.getImm32() & 63;
        emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
        if (AVX2) {
            static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[4] = shift;
            p[8] += instr.dst;
            p[9] = 64 - shift;
            p[14] += instr.dst * 8;
        }
    }
    break;

    case randomx::SuperscalarInstructionType::IADD_C7:
    case randomx::SuperscalarInstructionType::IADD_C8:
    case randomx::SuperscalarInstructionType::IADD_C9:
        if (AVX2) {
            static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
            p[12] += instr.dst * 8;
            p[24] -= instr.dst * 8;
            p[26] += instr.dst * 8;
        }
        else {
            *(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
            codePos += 3;
            emit32(instr.getImm32(), code, codePos);
        }
        break;

    case randomx::SuperscalarInstructionType::IXOR_C7:
    case randomx::SuperscalarInstructionType::IXOR_C8:
    case randomx::SuperscalarInstructionType::IXOR_C9:
        if (AVX2) {
            static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            *(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
            p[12] += instr.dst * 8;
            p[24] -= instr.dst * 8;
            p[26] += instr.dst * 8;
        }
        else {
            *(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
            codePos += 3;
            emit32(instr.getImm32(), code, codePos);
        }
        break;

    case randomx::SuperscalarInstructionType::IMULH_R:
        *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16);
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
        codePos += 3;
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20,
                0xC5, 0xB5, 0x73, 0xD0, 0x20,
                0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x3D, 0xF4, 0xD8,
                0xC4, 0x41, 0x7D, 0xF4, 0xE1,
                0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
                0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
                0xC4, 0x41, 0x25, 0xEF, 0xC6,
                0xC4, 0x41, 0x25, 0xD4, 0xDC,
                0xC4, 0x41, 0x25, 0xD4, 0xDA,
                0xC4, 0x41, 0x25, 0xEF, 0xCE,
                0xC4, 0x42, 0x3D, 0x37, 0xC1,
                0xC4, 0x41, 0x3D, 0xDB, 0xC7,
                0xC5, 0xBD, 0xD4, 0xC0,
                0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
                0xC5, 0xA5, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.src;
            p[20] -= instr.dst * 8;
            p[27] += instr.dst * 8;
            p[67] += instr.dst * 9;
            p[77] += instr.dst * 9;
        }
        break;

    case randomx::SuperscalarInstructionType::ISMULH_R:
        *(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16);
        codePos += 3;
        *(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
        codePos += 3;
        if (AVX2) {
            static const uint8_t t[] = {
                0xC5, 0xBD, 0x73, 0xD0, 0x20,
                0xC5, 0xB5, 0x73, 0xD0, 0x20,
                0xC5, 0x7D, 0xF4, 0xD0,
                0xC5, 0x3D, 0xF4, 0xD8,
                0xC4, 0x41, 0x7D, 0xF4, 0xE1,
                0xC4, 0x41, 0x3D, 0xF4, 0xE9,
                0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
                0xC4, 0x41, 0x25, 0xEF, 0xC6,
                0xC4, 0x41, 0x25, 0xD4, 0xDC,
                0xC4, 0x41, 0x25, 0xD4, 0xDA,
                0xC4, 0x41, 0x25, 0xEF, 0xCE,
                0xC4, 0x42, 0x3D, 0x37, 0xC1,
                0xC4, 0x41, 0x3D, 0xDB, 0xC7,
                0xC4, 0x41, 0x15, 0xD4, 0xE8,
                0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
                0xC4, 0x41, 0x15, 0xD4, 0xC3,
                0xC4, 0x41, 0x35, 0xEF, 0xC9,
                0xC4, 0x62, 0x35, 0x37, 0xD0,
                0xC4, 0x62, 0x35, 0x37, 0xD8,
                0xC5, 0x2D, 0xDB, 0xD0,
                0xC5, 0x25, 0xDB, 0xD8,
                0xC4, 0x41, 0x3D, 0xFB, 0xC2,
                0xC4, 0xC1, 0x3D, 0xFB, 0xC3
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[3] += instr.dst;
            p[8] += instr.src;
            p[11] -= instr.dst * 8;
            p[13] += instr.src;
            p[17] += instr.src;
            p[20] -= instr.dst * 8;
            p[89] += instr.dst;
            p[94] += instr.src;
            p[98] += instr.src;
            p[102] += instr.dst;
            p[112] += instr.dst * 8;
        }
        break;

    case randomx::SuperscalarInstructionType::IMUL_RCP:
        *(uint32_t*)(code + codePos) = 0x0000B848UL;
        codePos += 2;
        emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
        emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos);
        if (AVX2) {
            static const uint8_t t[] = {
                0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
                0xC5, 0xBD, 0x73, 0xD0, 0x20,
                0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
                0xC4, 0x41, 0x7D, 0xF4, 0xD4,
                0xC5, 0x35, 0xF4, 0xD8,
                0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
                0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
                0xC5, 0xFD, 0x73, 0xF0, 0x20,
                0xC4, 0x41, 0x2D, 0xD4, 0xD3,
                0xC5, 0xAD, 0xD4, 0xC0
            };
            uint8_t* p = code + codePos;
            emit(t, code, codePos);
            p[12] += instr.dst;
            p[22] -= instr.dst * 8;
            p[28] += instr.dst;
            p[33] += instr.dst * 8;
            p[41] -= instr.dst * 8;
            p[43] += instr.dst;
            p[53] += instr.dst * 9;
        }
        break;

    default:
        UNREACHABLE;
    }
}

template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);

template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
    *(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);

    constexpr uint32_t add_table = 0x33333333u + (1u << (RegisterNeedsSib * 4));
    codePos += (add_table >> (src * 4)) & 0xf;

    emit32(instr.getImm32(), code, codePos);
    if (rax) {
        emitByte(0x25, code, codePos);
    }
    else {
        *(uint32_t*)(code + codePos) = 0xe181;
        codePos += 2;
    }
    emit32(AddressMask[instr.getModMem()], code, codePos);
}

template void JitCompilerX86::genAddressReg<false>(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos);
template void JitCompilerX86::genAddressReg<true>(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos);

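// add_table packs per-register LEA lengths into nibbles: 3 bytes for most
// registers, 4 for the one whose base encoding requires a SIB byte (r12,
// "r4" in the register map above, since its encoding collides with rsp).
// A shift and a mask thus replace a branch on src == RegisterNeedsSib.
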
FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, uint32_t& codePos) {
    const uint32_t dst = static_cast<uint32_t>(instr.dst) << 16;
    *(uint32_t*)(code + codePos) = 0x24808d41 + dst;
    codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3;

    emit32(instr.getImm32(), code, codePos);
    emitByte(0x25, code, codePos);

    const uint32_t mask1 = AddressMask[instr.getModMem()];
    const uint32_t mask2 = ScratchpadL3Mask;
    emit32((instr.mod < (StoreL3Condition << 4)) ? mask1 : mask2, code, codePos);
}

FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, uint32_t& codePos) {
    emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
}

void JitCompilerX86::h_IADD_RS(const Instruction& instr) {
    uint32_t pos = codePos;
    uint8_t* const p = code + pos;

    const uint32_t dst = instr.dst;
    const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | dst;

    uint32_t k = 0x048d4f + (dst << 19);
    if (dst == RegisterNeedsDisplacement)
        k = 0xac8d4f;

    *(uint32_t*)(p) = k | (sib << 24);
    *(uint32_t*)(p + 4) = instr.getImm32();

    pos += ((dst == RegisterNeedsDisplacement) ? 8 : 4);

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IADD_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x0604034c + (dst << 19), p, pos);
    }
    else {
        *(uint32_t*)(p + pos) = 0x86034c + (dst << 19);
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc02b4d + (dst << 19) + (src << 16);
        pos += 3;
    }
    else {
        *(uint32_t*)(p + pos) = 0xe88149 + (dst << 16);
        pos += 3;
        emit32(instr.getImm32(), p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISUB_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x06042b4c + (dst << 19), p, pos);
    }
    else {
        *(uint32_t*)(p + pos) = 0x862b4c + (dst << 19);
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    if (src != dst) {
        emit32(0xc0af0f4d + ((dst * 8 + src) << 24), p, pos);
    }
    else {
        *(uint32_t*)(p + pos) = 0xc0694d + (((dst << 3) + dst) << 16);
        pos += 3;
        emit32(instr.getImm32(), p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0604af0f4cull + (dst << 27);
        pos += 5;
    }
    else {
        emit32(0x86af0f4c + (dst << 27), p, pos);
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    *(uint32_t*)(p + pos) = 0xc08b49 + (dst << 16);
    *(uint32_t*)(p + pos + 3) = 0xe0f749 + (src << 16);
    *(uint32_t*)(p + pos + 6) = 0xc28b4c + (dst << 19);
    pos += 9;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_R_BMI2(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    *(uint32_t*)(p + pos) = 0xC4D08B49 + (dst << 16);
    *(uint32_t*)(p + pos + 4) = 0xC0F6FB42 + (dst << 27) + (src << 24);
    pos += 8;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0e24f748c08b49ull + (dst << 16);
        pos += 7;
    }
    else {
        *(uint64_t*)(p + pos) = 0xa6f748c08b49ull + (dst << 16);
        pos += 6;
        genAddressImm(instr, p, pos);
    }
    *(uint32_t*)(p + pos) = 0xc28b4c + (dst << 19);
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMULH_M_BMI2(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint32_t*)(p + pos) = static_cast<uint32_t>(0xC4D08B49 + (dst << 16));
        *(uint64_t*)(p + pos + 4) = 0x0E04F6FB62ULL + (dst << 27);
        pos += 9;
    }
    else {
        *(uint64_t*)(p + pos) = 0x86F6FB62C4D08B49ULL + (dst << 16) + (dst << 59);
        *(uint32_t*)(p + pos + 8) = instr.getImm32() & ScratchpadL3Mask;
        pos += 12;
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISMULH_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    *(uint64_t*)(p + pos) = 0x8b4ce8f749c08b49ull + (dst << 16) + (src << 40);
    pos += 8;
    emitByte(0xc2 + 8 * dst, p, pos);

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISMULH_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<false>(instr, src, p, pos);
        *(uint64_t*)(p + pos) = 0x0e2cf748c08b49ull + (dst << 16);
        pos += 7;
    }
    else {
        *(uint64_t*)(p + pos) = 0xaef748c08b49ull + (dst << 16);
        pos += 6;
        genAddressImm(instr, p, pos);
    }
    *(uint32_t*)(p + pos) = 0xc28b4c + (dst << 19);
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IMUL_RCP(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    uint64_t divisor = instr.getImm32();
    if (!isZeroOrPowerOf2(divisor)) {
        const uint32_t dst = instr.dst;

        const uint64_t reciprocal = randomx_reciprocal_fast(divisor);
        if (imul_rcp_storage_used < 16) {
            *(uint64_t*)(imul_rcp_storage) = reciprocal;
            *(uint64_t*)(p + pos) = 0x2444AF0F4Cull + (dst << 27) + (static_cast<uint64_t>(248 - imul_rcp_storage_used * 8) << 40);
            ++imul_rcp_storage_used;
            imul_rcp_storage += 11;
            pos += 6;
        }
        else {
            *(uint32_t*)(p + pos) = 0xb848;
            pos += 2;

            emit64(reciprocal, p, pos);

            emit32(0xc0af0f4c + (dst << 27), p, pos);
        }

        registerUsage[dst] = pos;
    }

    codePos = pos;
}

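// IMUL_RCP replaces division by a constant with multiplication by its 64-bit
// reciprocal (randomx_reciprocal_fast). The first 16 reciprocals are patched
// into the prologue's randomx_program_imul_rcp_store block (11 bytes per
// slot) and consumed with a compact 6-byte "4C 0F AF 44 24 disp8"
// (imul reg, [rsp+disp8]); any further ones fall back to an inline 10-byte
// "mov rax, imm64" plus "imul reg, rax".
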
void JitCompilerX86::h_INEG_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t dst = instr.dst;
    *(uint32_t*)(p + pos) = 0xd8f749 + (dst << 16);
    pos += 3;

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IXOR_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc0334d + (((dst << 3) + src) << 16);
        pos += 3;
    }
    else {
        const uint64_t imm = instr.getImm32();
        *(uint64_t*)(p + pos) = (imm << 24) + 0xf08149 + (dst << 16);
        pos += 7;
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IXOR_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        genAddressReg<true>(instr, src, p, pos);
        emit32(0x0604334c + (dst << 19), p, pos);
    }
    else {
        *(uint32_t*)(p + pos) = 0x86334c + (dst << 19);
        pos += 3;
        genAddressImm(instr, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IROR_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        *(uint64_t*)(p + pos) = 0xc8d349c88b41ull + (src << 16) + (dst << 40);
        pos += 6;
    }
    else {
        *(uint32_t*)(p + pos) = 0xc8c149 + (dst << 16);
        pos += 3;
        emitByte(instr.getImm32() & 63, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_IROL_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t src = instr.src;
    const uint64_t dst = instr.dst;

    if (src != dst) {
        *(uint64_t*)(p + pos) = 0xc0d349c88b41ull + (src << 16) + (dst << 40);
        pos += 6;
    }
    else {
        *(uint32_t*)(p + pos) = 0xc0c149 + (dst << 16);
        pos += 3;
        emitByte(instr.getImm32() & 63, p, pos);
    }

    registerUsage[dst] = pos;
    codePos = pos;
}

void JitCompilerX86::h_ISWAP_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst;

    if (src != dst) {
        *(uint32_t*)(p + pos) = 0xc0874d + (((dst << 3) + src) << 16);
        pos += 3;
        registerUsage[dst] = pos;
        registerUsage[src] = pos;
    }

    codePos = pos;
}

void JitCompilerX86::h_FSWAP_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint64_t dst = instr.dst;

    *(uint64_t*)(p + pos) = 0x01c0c60f66ull + (((dst << 3) + dst) << 24);
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FADD_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;

    *(uint64_t*)(p + pos) = 0xc0580f4166ull + (((dst << 3) + src) << 32);
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FADD_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst % RegisterCountFlt;

    genAddressReg<true>(instr, src, p, pos);
    *(uint64_t*)(p + pos) = 0x41660624e60f44f3ull;
    *(uint32_t*)(p + pos + 8) = 0xc4580f + (dst << 19);
    pos += 11;

    codePos = pos;
}

void JitCompilerX86::h_FSUB_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;

    *(uint64_t*)(p + pos) = 0xc05c0f4166ull + (((dst << 3) + src) << 32);
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FSUB_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint32_t src = instr.src;
    const uint32_t dst = instr.dst % RegisterCountFlt;

    genAddressReg<true>(instr, src, p, pos);
    *(uint64_t*)(p + pos) = 0x41660624e60f44f3ull;
    *(uint32_t*)(p + pos + 8) = 0xc45c0f + (dst << 19);
    pos += 11;

    codePos = pos;
}

void JitCompilerX86::h_FSCAL_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const uint32_t dst = instr.dst % RegisterCountFlt;

    emit32(0xc7570f41 + (dst << 27), p, pos);

    codePos = pos;
}

void JitCompilerX86::h_FMUL_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint64_t dst = instr.dst % RegisterCountFlt;
    const uint64_t src = instr.src % RegisterCountFlt;

    *(uint64_t*)(p + pos) = 0xe0590f4166ull + (((dst << 3) + src) << 32);
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FDIV_M(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint32_t src = instr.src;
    const uint64_t dst = instr.dst % RegisterCountFlt;

    genAddressReg<true>(instr, src, p, pos);

    *(uint64_t*)(p + pos) = 0x0624e60f44f3ull;
    pos += 6;
    if (hasXOP) {
        *(uint64_t*)(p + pos) = 0xd0e6a218488full;
        pos += 6;
    }
    else {
        *(uint64_t*)(p + pos) = 0xe6560f45e5540f45ull;
        pos += 8;
    }
    *(uint64_t*)(p + pos) = 0xe45e0f4166ull + (dst << 35);
    pos += 5;

    codePos = pos;
}

void JitCompilerX86::h_FSQRT_R(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    prevFPOperation = pos;

    const uint32_t dst = instr.dst % RegisterCountFlt;

    emit32(0xe4510f66 + (((dst << 3) + dst) << 24), p, pos);

    codePos = pos;
}

void JitCompilerX86::h_CFROUND(const Instruction& instr) {
    uint8_t* const p = code;
    int32_t t = prevCFROUND;

    if (t > prevFPOperation) {
        if (vm_flags & RANDOMX_FLAG_AMD) {
            memcpy(p + t, NOP26, 26);
        }
        else {
            memcpy(p + t, NOP14, 14);
        }
    }

    uint32_t pos = codePos;
    prevCFROUND = pos;

    const uint32_t src = instr.src;

    *(uint32_t*)(p + pos) = 0x00C08B49 + (src << 16);
    const int rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
    *(uint32_t*)(p + pos + 3) = 0x00C8C148 + (rotate << 24);

    if (vm_flags & RANDOMX_FLAG_AMD) {
        *(uint64_t*)(p + pos + 7) = 0x742024443B0CE083ULL;
        *(uint64_t*)(p + pos + 15) = 0x8900EB0414AE0F0AULL;
        *(uint32_t*)(p + pos + 23) = 0x202444;
        pos += 26;
    }
    else {
        *(uint64_t*)(p + pos + 7) = 0x0414AE0F0CE083ULL;
        pos += 14;
    }

    codePos = pos;
}

void JitCompilerX86::h_CFROUND_BMI2(const Instruction& instr) {
    uint8_t* const p = code;
    int32_t t = prevCFROUND;

    if (t > prevFPOperation) {
        if (vm_flags & RANDOMX_FLAG_AMD) {
            memcpy(p + t, NOP25, 25);
        }
        else {
            memcpy(p + t, NOP13, 13);
        }
    }

    uint32_t pos = codePos;
    prevCFROUND = pos;

    const uint64_t src = instr.src;

    const uint64_t rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
    *(uint64_t*)(p + pos) = 0xC0F0FBC3C4ULL | (src << 32) | (rotate << 40);

    if (vm_flags & RANDOMX_FLAG_AMD) {
        *(uint64_t*)(p + pos + 6) = 0x742024443B0CE083ULL;
        *(uint64_t*)(p + pos + 14) = 0x8900EB0414AE0F0AULL;
        *(uint32_t*)(p + pos + 22) = 0x202444;
        pos += 25;
    }
    else {
        *(uint64_t*)(p + pos + 6) = 0x0414AE0F0CE083ULL;
        pos += 13;
    }

    codePos = pos;
}

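// Dead-store elimination for the rounding mode: if the previous CFROUND was
// not followed by any floating-point operation (prevCFROUND >
// prevFPOperation), its bytes are overwritten with a NOP of identical size,
// so only the latest rounding-mode change survives. The AMD variant compares
// the new rounding bits against a saved copy at [rsp+0x20] and skips the
// LDMXCSR when they are unchanged, presumably because LDMXCSR is costly on
// those CPUs.
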
template<bool jccErratum>
void JitCompilerX86::h_CBRANCH(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    const int reg = instr.dst;
    int32_t jmp_offset = registerUsage[reg];

    // If it jumps over the previous FP instruction that uses rounding, treat it as if the FP instruction happened now
    if (jmp_offset <= prevFPOperation) {
        prevFPOperation = pos;
    }

    jmp_offset -= pos + 16;

    if (jccErratum) {
        const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
        const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));

        // If the jump crosses or touches a 32-byte boundary, align it
        if ((branch_begin ^ branch_end) >= 32) {
            const uint32_t alignment_size = 32 - (branch_begin & 31);
            jmp_offset -= alignment_size;
            emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos);
        }
    }

    *(uint32_t*)(p + pos) = 0x00c08149 + (reg << 16);
    const int shift = instr.getModCond();
    const uint32_t or_mask = (1UL << RandomX_ConfigurationBase::JumpOffset) << shift;
    const uint32_t and_mask = rotl32(~static_cast<uint32_t>(1UL << (RandomX_ConfigurationBase::JumpOffset - 1)), shift);
    *(uint32_t*)(p + pos + 3) = (instr.getImm32() | or_mask) & and_mask;
    *(uint32_t*)(p + pos + 7) = 0x00c0f749 + (reg << 16);
    *(uint32_t*)(p + pos + 10) = RandomX_ConfigurationBase::ConditionMask_Calculated << shift;
    pos += 14;

    if (jmp_offset >= -128) {
        *(uint32_t*)(p + pos) = 0x74 + (static_cast<uint32_t>(jmp_offset) << 8);
        pos += 2;
    }
    else {
        *(uint64_t*)(p + pos) = 0x840f + (static_cast<uint64_t>(jmp_offset - 4) << 16);
        pos += 6;
    }

    // Mark all registers as used
    uint64_t* r = (uint64_t*) registerUsage;
    uint64_t k = pos;
    k |= k << 32;
    for (unsigned j = 0; j < RegistersCount / 2; ++j) {
        r[j] = k;
    }

    codePos = pos;
}

template void JitCompilerX86::h_CBRANCH<false>(const Instruction&);
template void JitCompilerX86::h_CBRANCH<true>(const Instruction&);

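// CBRANCH jumps back to registerUsage[reg], i.e. just past the instruction
// that last wrote the branch register, so every taken branch re-executes a
// dependent instruction chain. The or_mask/and_mask pair pins two bits of
// the random immediate (one just above the tested condition window set, the
// top bit inside it cleared), which by design fixes the taken probability
// and rules out unbounded looping. All registers are then marked as used at
// this position, since control flow may merge here.
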
void JitCompilerX86::h_ISTORE(const Instruction& instr) {
    uint8_t* const p = code;
    uint32_t pos = codePos;

    genAddressRegDst(instr, p, pos);
    emit32(0x0604894c + (static_cast<uint32_t>(instr.src) << 19), p, pos);

    codePos = pos;
}

void JitCompilerX86::h_NOP(const Instruction& instr) {
    emitByte(0x90, code, codePos);
}

alignas(64) InstructionGeneratorX86 JitCompilerX86::engine[257] = {};

} // namespace randomx