
/*
Copyright (c) 2018-2019, tevador <tevador@gmail.com>

All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
	* Redistributions of source code must retain the above copyright
	  notice, this list of conditions and the following disclaimer.
	* Redistributions in binary form must reproduce the above copyright
	  notice, this list of conditions and the following disclaimer in the
	  documentation and/or other materials provided with the distribution.
	* Neither the name of the copyright holder nor the
	  names of its contributors may be used to endorse or promote products
	  derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <stdexcept>
#include <cstring>
#include <climits>
#include <atomic>
#include "crypto/randomx/jit_compiler_x86.hpp"
#include "crypto/randomx/jit_compiler_x86_static.hpp"
#include "crypto/randomx/superscalar.hpp"
#include "crypto/randomx/program.hpp"
#include "crypto/randomx/reciprocal.h"
#include "crypto/randomx/virtual_memory.hpp"

#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif

namespace randomx {
/*

REGISTER ALLOCATION:

; rax -> temporary
; rbx -> iteration counter "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> dataset pointer
; rbp -> memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8 -> "r0"
; r9 -> "r1"
; r10 -> "r2"
; r11 -> "r3"
; r12 -> "r4"
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0 -> "f0"
; xmm1 -> "f1"
; xmm2 -> "f2"
; xmm3 -> "f3"
; xmm4 -> "e0"
; xmm5 -> "e1"
; xmm6 -> "e2"
; xmm7 -> "e3"
; xmm8 -> "a0"
; xmm9 -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
; xmm14 -> E 'or' mask = 0x3*00000000******3*00000000******
; xmm15 -> scale mask = 0x81f000000000000081f0000000000000

*/

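// Pointers into the hand-written assembly blocks defined in jit_compiler_x86_static; the size
// constants below are the byte distances between consecutive labels, and the blocks are copied
// into the per-program code buffer as needed.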
const uint8_t* codePrefetchScratchpad = (uint8_t*)&randomx_prefetch_scratchpad;
const uint8_t* codePrefetchScratchpadEnd = (uint8_t*)&randomx_prefetch_scratchpad_end;
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDataset = (uint8_t*)&randomx_program_read_dataset;
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
const uint8_t* codeDatasetInit = (uint8_t*)&randomx_dataset_init;
const uint8_t* codeLoopStore = (uint8_t*)&randomx_program_loop_store;
const uint8_t* codeLoopEnd = (uint8_t*)&randomx_program_loop_end;
const uint8_t* codeEpilogue = (uint8_t*)&randomx_program_epilogue;
const uint8_t* codeProgramEnd = (uint8_t*)&randomx_program_end;
const uint8_t* codeShhLoad = (uint8_t*)&randomx_sshash_load;
const uint8_t* codeShhPrefetch = (uint8_t*)&randomx_sshash_prefetch;
const uint8_t* codeShhEnd = (uint8_t*)&randomx_sshash_end;
const uint8_t* codeShhInit = (uint8_t*)&randomx_sshash_init;

const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t readDatasetSize = codeReadDatasetLightSshInit - codeReadDataset;
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
const int32_t datasetInitSize = codeEpilogue - codeDatasetInit;
const int32_t epilogueSize = codeShhLoad - codeEpilogue;
const int32_t codeSshLoadSize = codeShhPrefetch - codeShhLoad;
const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch;
const int32_t codeSshInitSize = codeProgramEnd - codeShhInit;

const int32_t epilogueOffset = (CodeSize - epilogueSize) & ~63;
constexpr int32_t superScalarHashOffset = 32768;

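// Raw x86-64 machine-code fragments (mostly a REX prefix plus an opcode); the ModRM/SIB and
// immediate bytes are appended per instruction by the emit*/emitByte/emit32 calls below.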
static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
static const uint8_t REX_ADD_RM[] = { 0x4c, 0x03 };
static const uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
static const uint8_t REX_SUB_RM[] = { 0x4c, 0x2b };
static const uint8_t REX_MOV_RR[] = { 0x41, 0x8b };
static const uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
static const uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
static const uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
static const uint8_t REX_IMUL_RRI[] = { 0x4d, 0x69 };
static const uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
static const uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
static const uint8_t REX_MUL_M[] = { 0x48, 0xf7 };
static const uint8_t REX_81[] = { 0x49, 0x81 };
static const uint8_t AND_EAX_I = 0x25;
static const uint8_t MOV_EAX_I = 0xb8;
static const uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
static const uint8_t MOV_RCX_I[] = { 0x48, 0xb9 };
static const uint8_t REX_LEA[] = { 0x4f, 0x8d };
static const uint8_t REX_MUL_MEM[] = { 0x48, 0xf7, 0x24, 0x0e };
static const uint8_t REX_IMUL_MEM[] = { 0x48, 0xf7, 0x2c, 0x0e };
static const uint8_t REX_SHR_RAX[] = { 0x48, 0xc1, 0xe8 };
static const uint8_t RAX_ADD_SBB_1[] = { 0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00 };
static const uint8_t MUL_RCX[] = { 0x48, 0xf7, 0xe1 };
static const uint8_t REX_SHR_RDX[] = { 0x48, 0xc1, 0xea };
static const uint8_t REX_SH[] = { 0x49, 0xc1 };
static const uint8_t MOV_RCX_RAX_SAR_RCX_63[] = { 0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f };
static const uint8_t AND_ECX_I[] = { 0x81, 0xe1 };
static const uint8_t ADD_RAX_RCX[] = { 0x48, 0x01, 0xC8 };
static const uint8_t SAR_RAX_I8[] = { 0x48, 0xC1, 0xF8 };
static const uint8_t NEG_RAX[] = { 0x48, 0xF7, 0xD8 };
static const uint8_t ADD_R_RAX[] = { 0x4C, 0x03 };
static const uint8_t XOR_EAX_EAX[] = { 0x33, 0xC0 };
static const uint8_t ADD_RDX_R[] = { 0x4c, 0x01 };
static const uint8_t SUB_RDX_R[] = { 0x4c, 0x29 };
static const uint8_t SAR_RDX_I8[] = { 0x48, 0xC1, 0xFA };
static const uint8_t TEST_RDX_RDX[] = { 0x48, 0x85, 0xD2 };
static const uint8_t SETS_AL_ADD_RDX_RAX[] = { 0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0 };
static const uint8_t REX_NEG[] = { 0x49, 0xF7 };
static const uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
static const uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
static const uint8_t REX_XOR_RM[] = { 0x4c, 0x33 };
static const uint8_t REX_ROT_CL[] = { 0x49, 0xd3 };
static const uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
static const uint8_t SHUFPD[] = { 0x66, 0x0f, 0xc6 };
static const uint8_t REX_ADDPD[] = { 0x66, 0x41, 0x0f, 0x58 };
static const uint8_t REX_CVTDQ2PD_XMM12[] = { 0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06 };
static const uint8_t REX_SUBPD[] = { 0x66, 0x41, 0x0f, 0x5c };
static const uint8_t REX_XORPS[] = { 0x41, 0x0f, 0x57 };
static const uint8_t REX_MULPD[] = { 0x66, 0x41, 0x0f, 0x59 };
static const uint8_t REX_MAXPD[] = { 0x66, 0x41, 0x0f, 0x5f };
static const uint8_t REX_DIVPD[] = { 0x66, 0x41, 0x0f, 0x5e };
static const uint8_t SQRTPD[] = { 0x66, 0x0f, 0x51 };
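// AND_OR_MOV_LDMXCSR decodes to: and eax, 0x6000; or eax, 0x9FC0; mov [rsp-4], eax; ldmxcsr [rsp-4]
// i.e. it keeps only the two rounding-control bits and loads them into MXCSR (used by h_CFROUND).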
static const uint8_t AND_OR_MOV_LDMXCSR[] = { 0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x89, 0x44, 0x24, 0xFC, 0x0F, 0xAE, 0x54, 0x24, 0xFC };
static const uint8_t ROL_RAX[] = { 0x48, 0xc1, 0xc0 };
static const uint8_t XOR_ECX_ECX[] = { 0x33, 0xC9 };
static const uint8_t REX_CMP_R32I[] = { 0x41, 0x81 };
static const uint8_t REX_CMP_M32I[] = { 0x81, 0x3c, 0x06 };
static const uint8_t MOVAPD[] = { 0x66, 0x0f, 0x29 };
static const uint8_t REX_MOV_MR[] = { 0x4c, 0x89 };
static const uint8_t REX_XOR_EAX[] = { 0x41, 0x33 };
static const uint8_t SUB_EBX[] = { 0x83, 0xEB, 0x01 };
static const uint8_t JNZ[] = { 0x0f, 0x85 };
static const uint8_t JMP = 0xe9;
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
static const uint8_t CALL = 0xe8;
static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
static const uint8_t JZ[] = { 0x0f, 0x84 };
static const uint8_t JZ_SHORT = 0x74;
static const uint8_t RET = 0xc3;
static const uint8_t LEA_32[] = { 0x41, 0x8d };
static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
static const uint8_t ADD_EBX_I[] = { 0x81, 0xc3 };

static const uint8_t NOP1[] = { 0x90 };
static const uint8_t NOP2[] = { 0x66, 0x90 };
static const uint8_t NOP3[] = { 0x66, 0x66, 0x90 };
static const uint8_t NOP4[] = { 0x0F, 0x1F, 0x40, 0x00 };
static const uint8_t NOP5[] = { 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP6[] = { 0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00 };
static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };

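// JMP_ALIGN_PREFIX[n] supplies n bytes of padding: 0x2E segment-override prefixes (ignored in
// 64-bit mode) attached to the next instruction, preceded by a multi-byte NOP when more than
// 8 bytes are needed. h_CBRANCH emits it so the conditional branch does not cross a 32-byte
// boundary on CPUs affected by the Intel JCC erratum (see applyTweaks).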
static const uint8_t JMP_ALIGN_PREFIX[14][16] = {
	{},
	{0x2E},
	{0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
};

bool JitCompilerX86::BranchesWithin32B = false;

size_t JitCompilerX86::getCodeSize() {
	return codePos < prologueSize ? 0 : codePos - prologueSize;
}

static inline void cpuid(uint32_t level, int32_t output[4])
{
	memset(output, 0, sizeof(int32_t) * 4);

#	ifdef _MSC_VER
	__cpuid(output, static_cast<int>(level));
#	else
	__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
#	endif
}

// CPU-specific tweaks
void JitCompilerX86::applyTweaks() {
	int32_t info[4];
	cpuid(0, info);

	int32_t manufacturer[4];
	manufacturer[0] = info[1];
	manufacturer[1] = info[3];
	manufacturer[2] = info[2];
	manufacturer[3] = 0;

	if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
		struct
		{
			unsigned int stepping : 4;
			unsigned int model : 4;
			unsigned int family : 4;
			unsigned int processor_type : 2;
			unsigned int reserved1 : 2;
			unsigned int ext_model : 4;
			unsigned int ext_family : 8;
			unsigned int reserved2 : 4;
		} processor_info;

		cpuid(1, info);
		memcpy(&processor_info, info, sizeof(processor_info));

		// Intel JCC erratum mitigation
		if (processor_info.family == 6) {
			const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
			const uint32_t stepping = processor_info.stepping;

			// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
			BranchesWithin32B =
				((model == 0x4E) && (stepping == 0x3)) ||
				((model == 0x55) && (stepping == 0x4)) ||
				((model == 0x5E) && (stepping == 0x3)) ||
				((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
				((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
				((model == 0xA6) && (stepping == 0x0)) ||
				((model == 0xAE) && (stepping == 0xA));
		}
	}
}

static std::atomic<size_t> codeOffset;

JitCompilerX86::JitCompilerX86() {
	applyTweaks();
	allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
	// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
	code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
	memcpy(code, codePrologue, prologueSize);
	memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
}

JitCompilerX86::~JitCompilerX86() {
	freePagedMemory(allocatedCode, CodeSize);
}

void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
	generateProgramPrologue(prog, pcfg);
	memcpy(code + codePos, RandomX_CurrentConfig.codeReadDatasetTweaked, readDatasetSize);
	codePos += readDatasetSize;
	generateProgramEpilogue(prog, pcfg);
}

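// Light mode: instead of reading a precomputed dataset item, the generated program calls the
// JIT-compiled superscalar hash placed at superScalarHashOffset to compute the item on the fly.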
void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
	generateProgramPrologue(prog, pcfg);
	emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize, code, codePos);
	emit(ADD_EBX_I, code, codePos);
	emit32(datasetOffset / CacheLineSize, code, codePos);
	emitByte(CALL, code, codePos);
	emit32(superScalarHashOffset - (codePos + 4), code, codePos);
	emit(codeReadDatasetLightSshFin, readDatasetLightFinSize, code, codePos);
	generateProgramEpilogue(prog, pcfg);
}

template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N], std::vector<uint64_t> &reciprocalCache) {
	memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
	codePos = superScalarHashOffset + codeSshInitSize;
	for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
		SuperscalarProgram& prog = programs[j];
		for (unsigned i = 0; i < prog.getSize(); ++i) {
			Instruction& instr = prog(i);
			generateSuperscalarCode(instr, reciprocalCache);
		}
		emit(codeShhLoad, codeSshLoadSize, code, codePos);
		if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
			emit(REX_MOV_RR64, code, codePos);
			emitByte(0xd8 + prog.getAddressRegister(), code, codePos);
			emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
#ifdef RANDOMX_ALIGN
			int align = (codePos % 16);
			while (align != 0) {
				int nopSize = 16 - align;
				if (nopSize > 8) nopSize = 8;
				emit(NOPX[nopSize - 1], nopSize, code, codePos);
				align = (codePos % 16);
			}
#endif
		}
	}
	emitByte(RET, code, codePos);
}

template
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES], std::vector<uint64_t> &reciprocalCache);

void JitCompilerX86::generateDatasetInitCode() {
	memcpy(code, codeDatasetInit, datasetInitSize);
}

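// generateProgramPrologue patches the register numbers and scratchpad mask used by the first
// loop load directly into the copied prologue, then compiles all program instructions through
// the engine[] dispatch table.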
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
	codePos = ((uint8_t*)randomx_program_prologue_first_load) - ((uint8_t*)randomx_program_prologue);
	code[codePos + 2] = 0xc0 + pcfg.readReg0;
	code[codePos + 5] = 0xc0 + pcfg.readReg1;
	*(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
	*(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;

	codePos = prologueSize;
	memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
	memcpy(code + codePos, codeLoopLoad, loopLoadSize);
	codePos += loopLoadSize;

	//mark all registers as used
	uint64_t* r = (uint64_t*)registerUsage;
	uint64_t k = codePos;
	k |= k << 32;
	for (unsigned j = 0; j < RegistersCount / 2; ++j) {
		r[j] = k;
	}

	for (int i = 0, n = static_cast<int>(RandomX_CurrentConfig.ProgramSize); i < n; ++i) {
		Instruction instr = prog(i);
		*((uint64_t*)&instr) &= (uint64_t(-1) - (0xFFFF << 8)) | ((RegistersCount - 1) << 8) | ((RegistersCount - 1) << 16);
		(this->*(engine[instr.opcode]))(instr);
	}

	emit(REX_MOV_RR, code, codePos);
	emitByte(0xc0 + pcfg.readReg2, code, codePos);
	emit(REX_XOR_EAX, code, codePos);
	emitByte(0xc0 + pcfg.readReg3, code, codePos);
}

void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
	emit(REX_MOV_RR64, code, codePos);
	emitByte(0xc0 + pcfg.readReg0, code, codePos);
	emit(REX_XOR_RAX_R64, code, codePos);
	emitByte(0xc0 + pcfg.readReg1, code, codePos);
	emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);
	memcpy(code + codePos, codeLoopStore, loopStoreSize);
	codePos += loopStoreSize;

	if (BranchesWithin32B) {
		const uint32_t branch_begin = static_cast<uint32_t>(codePos);
		const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);

		// If the jump crosses or touches 32-byte boundary, align it
		if ((branch_begin ^ branch_end) >= 32) {
			uint32_t alignment_size = 32 - (branch_begin & 31);
			if (alignment_size > 8) {
				emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos);
				alignment_size = 8;
			}
			emit(NOPX[alignment_size - 1], alignment_size, code, codePos);
		}
	}

	emit(SUB_EBX, code, codePos);
	emit(JNZ, code, codePos);
	emit32(prologueSize - codePos - 4, code, codePos);
	emitByte(JMP, code, codePos);
	emit32(epilogueOffset - codePos - 4, code, codePos);
}

void JitCompilerX86::generateSuperscalarCode(Instruction& instr, std::vector<uint64_t> &reciprocalCache) {
	switch ((SuperscalarInstructionType)instr.opcode)
	{
	case randomx::SuperscalarInstructionType::ISUB_R:
		emit(REX_SUB_RR, code, codePos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IXOR_R:
		emit(REX_XOR_RR, code, codePos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IADD_RS:
		emit(REX_LEA, code, codePos);
		emitByte(0x04 + 8 * instr.dst, code, codePos);
		genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IMUL_R:
		emit(REX_IMUL_RR, code, codePos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IROR_C:
		emit(REX_ROT_I8, code, codePos);
		emitByte(0xc8 + instr.dst, code, codePos);
		emitByte(instr.getImm32() & 63, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IADD_C7:
		emit(REX_81, code, codePos);
		emitByte(0xc0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IXOR_C7:
		emit(REX_XOR_RI, code, codePos);
		emitByte(0xf0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IADD_C8:
		emit(REX_81, code, codePos);
		emitByte(0xc0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN
		emit(NOP1, code, codePos);
#endif
		break;
	case randomx::SuperscalarInstructionType::IXOR_C8:
		emit(REX_XOR_RI, code, codePos);
		emitByte(0xf0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN
		emit(NOP1, code, codePos);
#endif
		break;
	case randomx::SuperscalarInstructionType::IADD_C9:
		emit(REX_81, code, codePos);
		emitByte(0xc0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN
		emit(NOP2, code, codePos);
#endif
		break;
	case randomx::SuperscalarInstructionType::IXOR_C9:
		emit(REX_XOR_RI, code, codePos);
		emitByte(0xf0 + instr.dst, code, codePos);
		emit32(instr.getImm32(), code, codePos);
#ifdef RANDOMX_ALIGN
		emit(NOP2, code, codePos);
#endif
		break;
	case randomx::SuperscalarInstructionType::IMULH_R:
		emit(REX_MOV_RR64, code, codePos);
		emitByte(0xc0 + instr.dst, code, codePos);
		emit(REX_MUL_R, code, codePos);
		emitByte(0xe0 + instr.src, code, codePos);
		emit(REX_MOV_R64R, code, codePos);
		emitByte(0xc2 + 8 * instr.dst, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::ISMULH_R:
		emit(REX_MOV_RR64, code, codePos);
		emitByte(0xc0 + instr.dst, code, codePos);
		emit(REX_MUL_R, code, codePos);
		emitByte(0xe8 + instr.src, code, codePos);
		emit(REX_MOV_R64R, code, codePos);
		emitByte(0xc2 + 8 * instr.dst, code, codePos);
		break;
	case randomx::SuperscalarInstructionType::IMUL_RCP:
		emit(MOV_RAX_I, code, codePos);
		emit64(reciprocalCache[instr.getImm32()], code, codePos);
		emit(REX_IMUL_RM, code, codePos);
		emitByte(0xc0 + 8 * instr.dst, code, codePos);
		break;
	default:
		UNREACHABLE;
	}
}

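// genAddressReg emits "lea eax, [r<src> + imm32]" (ecx instead of eax when rax == false), with an
// extra SIB byte when the source register needs one (r12), followed by "and" with the L1 or L2
// scratchpad mask. The result is the operand's scratchpad offset; the callers then address memory
// as [rsi + rax] or [rsi + rcx].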
template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos) {
	const uint32_t src = *((uint32_t*)&instr) & 0xFF0000;

	*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + src;
	codePos += (src == (RegisterNeedsSib << 16)) ? 4 : 3;

	emit32(instr.getImm32(), code, codePos);
	if (rax)
		emitByte(AND_EAX_I, code, codePos);
	else
		emit(AND_ECX_I, code, codePos);
	emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
}

template void JitCompilerX86::genAddressReg<false>(const Instruction& instr, uint8_t* code, int& codePos);
template void JitCompilerX86::genAddressReg<true>(const Instruction& instr, uint8_t* code, int& codePos);

FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) {
	const uint32_t dst = static_cast<uint32_t>(instr.dst) << 16;
	*(uint32_t*)(code + codePos) = 0x24808d41 + dst;
	codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3;

	emit32(instr.getImm32(), code, codePos);
	emitByte(AND_EAX_I, code, codePos);
	if (instr.getModCond() < StoreL3Condition) {
		emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
	}
	else {
		emit32(ScratchpadL3Mask, code, codePos);
	}
}

FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) {
	emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
}

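// Each template_IADD_RS[dst] entry holds the REX + LEA opcode + ModRM bytes of
// "lea r<dst>, [r<dst> + r<src> * 2^shift]"; h_IADD_RS appends the SIB byte and, for
// dst == RegisterNeedsDisplacement (r13), a 32-bit displacement that the shorter form cannot encode.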
static const uint32_t template_IADD_RS[8] = {
	0x048d4f,
	0x0c8d4f,
	0x148d4f,
	0x1c8d4f,
	0x248d4f,
	0xac8d4f,
	0x348d4f,
	0x3c8d4f,
};

void JitCompilerX86::h_IADD_RS(const Instruction& instr) {
	int pos = codePos;
	uint8_t* const p = code + pos;

	const uint32_t sib = (instr.getModShift() << 6) | (instr.src << 3) | instr.dst;
	*(uint32_t*)(p) = template_IADD_RS[instr.dst] | (sib << 24);
	*(uint32_t*)(p + 4) = instr.getImm32();

	pos += ((instr.dst == RegisterNeedsDisplacement) ? 8 : 4);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

static const uint32_t template_IADD_M[8] = {
	0x0604034c,
	0x060c034c,
	0x0614034c,
	0x061c034c,
	0x0624034c,
	0x062c034c,
	0x0634034c,
	0x063c034c,
};

void JitCompilerX86::h_IADD_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<true>(instr, p, pos);
		emit32(template_IADD_M[instr.dst], p, pos);
	}
	else {
		emit(REX_ADD_RM, p, pos);
		emitByte(0x86 + 8 * instr.dst, p, pos);
		genAddressImm(instr, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, int& codePos) {
	emitByte((scale << 6) | (index << 3) | base, code, codePos);
}

void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_SUB_RR, p, pos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
	}
	else {
		emit(REX_81, p, pos);
		emitByte(0xe8 + instr.dst, p, pos);
		emit32(instr.getImm32(), p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_ISUB_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<true>(instr, p, pos);
		emit(REX_SUB_RM, p, pos);
		emitByte(0x04 + 8 * instr.dst, p, pos);
		emitByte(0x06, p, pos);
	}
	else {
		emit(REX_SUB_RM, p, pos);
		emitByte(0x86 + 8 * instr.dst, p, pos);
		genAddressImm(instr, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IMUL_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_IMUL_RR, p, pos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
	}
	else {
		emit(REX_IMUL_RRI, p, pos);
		emitByte(0xc0 + 9 * instr.dst, p, pos);
		emit32(instr.getImm32(), p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IMUL_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<true>(instr, p, pos);
		emit(REX_IMUL_RM, p, pos);
		emitByte(0x04 + 8 * instr.dst, p, pos);
		emitByte(0x06, p, pos);
	}
	else {
		emit(REX_IMUL_RM, p, pos);
		emitByte(0x86 + 8 * instr.dst, p, pos);
		genAddressImm(instr, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IMULH_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	emit(REX_MOV_RR64, p, pos);
	emitByte(0xc0 + instr.dst, p, pos);
	emit(REX_MUL_R, p, pos);
	emitByte(0xe0 + instr.src, p, pos);
	emit(REX_MOV_R64R, p, pos);
	emitByte(0xc2 + 8 * instr.dst, p, pos);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IMULH_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<false>(instr, p, pos);
		emit(REX_MOV_RR64, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
		emit(REX_MUL_MEM, p, pos);
	}
	else {
		emit(REX_MOV_RR64, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
		emit(REX_MUL_M, p, pos);
		emitByte(0xa6, p, pos);
		genAddressImm(instr, p, pos);
	}
	emit(REX_MOV_R64R, p, pos);
	emitByte(0xc2 + 8 * instr.dst, p, pos);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_ISMULH_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	emit(REX_MOV_RR64, p, pos);
	emitByte(0xc0 + instr.dst, p, pos);
	emit(REX_MUL_R, p, pos);
	emitByte(0xe8 + instr.src, p, pos);
	emit(REX_MOV_R64R, p, pos);
	emitByte(0xc2 + 8 * instr.dst, p, pos);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_ISMULH_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<false>(instr, p, pos);
		emit(REX_MOV_RR64, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
		emit(REX_IMUL_MEM, p, pos);
	}
	else {
		emit(REX_MOV_RR64, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
		emit(REX_MUL_M, p, pos);
		emitByte(0xae, p, pos);
		genAddressImm(instr, p, pos);
	}
	emit(REX_MOV_R64R, p, pos);
	emitByte(0xc2 + 8 * instr.dst, p, pos);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

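// IMUL_RCP multiplies the destination by a precomputed fixed-point reciprocal of the immediate
// (randomx_reciprocal_fast), and is a no-op when the divisor is zero or a power of two.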
void JitCompilerX86::h_IMUL_RCP(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	uint64_t divisor = instr.getImm32();
	if (!isZeroOrPowerOf2(divisor)) {
		emit(MOV_RAX_I, p, pos);
		emit64(randomx_reciprocal_fast(divisor), p, pos);
		emit(REX_IMUL_RM, p, pos);
		emitByte(0xc0 + 8 * instr.dst, p, pos);
		registerUsage[instr.dst] = pos;
	}

	codePos = pos;
}

void JitCompilerX86::h_INEG_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	emit(REX_NEG, p, pos);
	emitByte(0xd8 + instr.dst, p, pos);

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IXOR_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_XOR_RR, p, pos);
		emitByte(0xc0 + 8 * instr.dst + instr.src, p, pos);
	}
	else {
		emit(REX_XOR_RI, p, pos);
		emitByte(0xf0 + instr.dst, p, pos);
		emit32(instr.getImm32(), p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IXOR_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		genAddressReg<true>(instr, p, pos);
		emit(REX_XOR_RM, p, pos);
		emitByte(0x04 + 8 * instr.dst, p, pos);
		emitByte(0x06, p, pos);
	}
	else {
		emit(REX_XOR_RM, p, pos);
		emitByte(0x86 + 8 * instr.dst, p, pos);
		genAddressImm(instr, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IROR_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_MOV_RR, p, pos);
		emitByte(0xc8 + instr.src, p, pos);
		emit(REX_ROT_CL, p, pos);
		emitByte(0xc8 + instr.dst, p, pos);
	}
	else {
		emit(REX_ROT_I8, p, pos);
		emitByte(0xc8 + instr.dst, p, pos);
		emitByte(instr.getImm32() & 63, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_IROL_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_MOV_RR, p, pos);
		emitByte(0xc8 + instr.src, p, pos);
		emit(REX_ROT_CL, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
	}
	else {
		emit(REX_ROT_I8, p, pos);
		emitByte(0xc0 + instr.dst, p, pos);
		emitByte(instr.getImm32() & 63, p, pos);
	}

	registerUsage[instr.dst] = pos;
	codePos = pos;
}

void JitCompilerX86::h_ISWAP_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	if (instr.src != instr.dst) {
		emit(REX_XCHG, p, pos);
		emitByte(0xc0 + instr.src + 8 * instr.dst, p, pos);
		registerUsage[instr.dst] = pos;
		registerUsage[instr.src] = pos;
	}

	codePos = pos;
}

void JitCompilerX86::h_FSWAP_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	emit(SHUFPD, p, pos);
	emitByte(0xc0 + 9 * instr.dst, p, pos);
	emitByte(1, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FADD_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	const uint32_t src = instr.src % RegisterCountFlt;
	emit(REX_ADDPD, p, pos);
	emitByte(0xc0 + src + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FADD_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	genAddressReg<true>(instr, p, pos);
	emit(REX_CVTDQ2PD_XMM12, p, pos);
	emit(REX_ADDPD, p, pos);
	emitByte(0xc4 + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FSUB_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	const uint32_t src = instr.src % RegisterCountFlt;
	emit(REX_SUBPD, p, pos);
	emitByte(0xc0 + src + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FSUB_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	genAddressReg<true>(instr, p, pos);
	emit(REX_CVTDQ2PD_XMM12, p, pos);
	emit(REX_SUBPD, p, pos);
	emitByte(0xc4 + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FSCAL_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	emit(REX_XORPS, p, pos);
	emitByte(0xc7 + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FMUL_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	const uint32_t src = instr.src % RegisterCountFlt;
	emit(REX_MULPD, p, pos);
	emitByte(0xe0 + src + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FDIV_M(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	genAddressReg<true>(instr, p, pos);
	emit(REX_CVTDQ2PD_XMM12, p, pos);
	emit(REX_ANDPS_XMM12, p, pos);
	emit(REX_DIVPD, p, pos);
	emitByte(0xe4 + 8 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_FSQRT_R(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const uint32_t dst = instr.dst % RegisterCountFlt;
	emit(SQRTPD, p, pos);
	emitByte(0xe4 + 9 * dst, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_CFROUND(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	emit(REX_MOV_RR64, p, pos);
	emitByte(0xc0 + instr.src, p, pos);
	int rotate = (13 - (instr.getImm32() & 63)) & 63;
	if (rotate != 0) {
		emit(ROL_RAX, p, pos);
		emitByte(rotate, p, pos);
	}
	emit(AND_OR_MOV_LDMXCSR, p, pos);

	codePos = pos;
}

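// h_CBRANCH adds a constant to the destination register (with one "condition" bit forced to 1 and
// the bit below it forced to 0), tests the shifted condition mask and, if all masked bits are zero,
// jumps back to just after the instruction that last wrote that register. When BranchesWithin32B
// is set, JMP_ALIGN_PREFIX padding keeps the test/jump pair from crossing a 32-byte boundary
// (Intel JCC erratum workaround).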
void JitCompilerX86::h_CBRANCH(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	const int reg = instr.dst;
	int32_t jmp_offset = registerUsage[reg] - (pos + 16);

	if (BranchesWithin32B) {
		const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
		const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));

		// If the jump crosses or touches 32-byte boundary, align it
		if ((branch_begin ^ branch_end) >= 32) {
			const uint32_t alignment_size = 32 - (branch_begin & 31);
			jmp_offset -= alignment_size;
			emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos);
		}
	}

	emit(REX_ADD_I, p, pos);
	emitByte(0xc0 + reg, p, pos);
	const int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset;
	const uint32_t imm = (instr.getImm32() | (1UL << shift)) & ~(1UL << (shift - 1));
	emit32(imm, p, pos);
	emit(REX_TEST, p, pos);
	emitByte(0xc0 + reg, p, pos);
	emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos);

	if (jmp_offset >= -128) {
		emitByte(JZ_SHORT, p, pos);
		emitByte(jmp_offset, p, pos);
	}
	else {
		emit(JZ, p, pos);
		emit32(jmp_offset - 4, p, pos);
	}

	//mark all registers as used
	uint64_t* r = (uint64_t*) registerUsage;
	uint64_t k = pos;
	k |= k << 32;
	for (unsigned j = 0; j < RegistersCount / 2; ++j) {
		r[j] = k;
	}

	codePos = pos;
}

void JitCompilerX86::h_ISTORE(const Instruction& instr) {
	uint8_t* const p = code;
	int pos = codePos;

	genAddressRegDst(instr, p, pos);
	emit(REX_MOV_MR, p, pos);
	emitByte(0x04 + 8 * instr.src, p, pos);
	emitByte(0x06, p, pos);

	codePos = pos;
}

void JitCompilerX86::h_NOP(const Instruction& instr) {
	emit(NOP1, code, codePos);
}

InstructionGeneratorX86 JitCompilerX86::engine[256] = {};

}