This commit is contained in:
commit
baeb45e8a1
30 changed files with 267 additions and 88 deletions
|
@ -1,5 +1,5 @@
|
|||
;# save VM register values
|
||||
add rsp, 24
|
||||
add rsp, 40
|
||||
pop rcx
|
||||
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
lea rcx, [rsi+rax]
|
||||
mov [rsp+8], rcx
|
||||
mov [rsp+16], rcx
|
||||
xor r8, qword ptr [rcx+0]
|
||||
xor r9, qword ptr [rcx+8]
|
||||
xor r10, qword ptr [rcx+16]
|
||||
|
@ -9,7 +9,7 @@
|
|||
xor r14, qword ptr [rcx+48]
|
||||
xor r15, qword ptr [rcx+56]
|
||||
lea rcx, [rsi+rdx]
|
||||
mov [rsp+16], rcx
|
||||
mov [rsp+24], rcx
|
||||
cvtdq2pd xmm0, qword ptr [rcx+0]
|
||||
cvtdq2pd xmm1, qword ptr [rcx+8]
|
||||
cvtdq2pd xmm2, qword ptr [rcx+16]
|
||||
|
@ -18,11 +18,11 @@
|
|||
cvtdq2pd xmm5, qword ptr [rcx+40]
|
||||
cvtdq2pd xmm6, qword ptr [rcx+48]
|
||||
cvtdq2pd xmm7, qword ptr [rcx+56]
|
||||
andps xmm4, xmm13
|
||||
andps xmm5, xmm13
|
||||
andps xmm6, xmm13
|
||||
andps xmm7, xmm13
|
||||
orps xmm4, xmm14
|
||||
orps xmm5, xmm14
|
||||
orps xmm6, xmm14
|
||||
orps xmm7, xmm14
|
||||
andpd xmm4, xmm13
|
||||
andpd xmm5, xmm13
|
||||
andpd xmm6, xmm13
|
||||
andpd xmm7, xmm13
|
||||
orpd xmm4, xmm14
|
||||
orpd xmm5, xmm14
|
||||
orpd xmm6, xmm14
|
||||
orpd xmm7, xmm14
|
||||
|
|
24
src/crypto/randomx/asm/program_loop_load_xop.inc
Normal file
24
src/crypto/randomx/asm/program_loop_load_xop.inc
Normal file
|
@ -0,0 +1,24 @@
|
|||
lea rcx, [rsi+rax]
|
||||
mov [rsp+16], rcx
|
||||
xor r8, qword ptr [rcx+0]
|
||||
xor r9, qword ptr [rcx+8]
|
||||
xor r10, qword ptr [rcx+16]
|
||||
xor r11, qword ptr [rcx+24]
|
||||
xor r12, qword ptr [rcx+32]
|
||||
xor r13, qword ptr [rcx+40]
|
||||
xor r14, qword ptr [rcx+48]
|
||||
xor r15, qword ptr [rcx+56]
|
||||
lea rcx, [rsi+rdx]
|
||||
mov [rsp+24], rcx
|
||||
cvtdq2pd xmm0, qword ptr [rcx+0]
|
||||
cvtdq2pd xmm1, qword ptr [rcx+8]
|
||||
cvtdq2pd xmm2, qword ptr [rcx+16]
|
||||
cvtdq2pd xmm3, qword ptr [rcx+24]
|
||||
cvtdq2pd xmm4, qword ptr [rcx+32]
|
||||
cvtdq2pd xmm5, qword ptr [rcx+40]
|
||||
cvtdq2pd xmm6, qword ptr [rcx+48]
|
||||
cvtdq2pd xmm7, qword ptr [rcx+56]
|
||||
vpcmov xmm4, xmm4, xmm14, xmm13
|
||||
vpcmov xmm5, xmm5, xmm14, xmm13
|
||||
vpcmov xmm6, xmm6, xmm14, xmm13
|
||||
vpcmov xmm7, xmm7, xmm14, xmm13
|
|
@ -1,4 +1,4 @@
|
|||
mov rcx, [rsp+16]
|
||||
mov rcx, [rsp+24]
|
||||
mov qword ptr [rcx+0], r8
|
||||
mov qword ptr [rcx+8], r9
|
||||
mov qword ptr [rcx+16], r10
|
||||
|
@ -7,7 +7,7 @@
|
|||
mov qword ptr [rcx+40], r13
|
||||
mov qword ptr [rcx+48], r14
|
||||
mov qword ptr [rcx+56], r15
|
||||
mov rcx, [rsp+8]
|
||||
mov rcx, [rsp+16]
|
||||
xorpd xmm0, xmm4
|
||||
xorpd xmm1, xmm5
|
||||
xorpd xmm2, xmm6
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
mantissaMask:
|
||||
db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0
|
||||
db 0, 0, 192, 255, 255, 255, 255, 0, 0, 0, 192, 255, 255, 255, 255, 0
|
||||
exp240:
|
||||
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
|
||||
scaleMask:
|
||||
|
|
|
@ -89,6 +89,7 @@ namespace randomx {
|
|||
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
|
||||
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
|
||||
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
|
||||
const uint8_t* codeLoopLoadXOP = (uint8_t*)&randomx_program_loop_load_xop;
|
||||
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
|
||||
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
|
||||
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
|
||||
|
@ -104,7 +105,8 @@ namespace randomx {
|
|||
|
||||
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
|
||||
const int32_t prologueSize = codeLoopBegin - codePrologue;
|
||||
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
|
||||
const int32_t loopLoadSize = codeLoopLoadXOP - codeLoopLoad;
|
||||
const int32_t loopLoadXOPSize = codeProgamStart - codeLoopLoadXOP;
|
||||
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
|
||||
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
|
||||
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
|
||||
|
@ -184,6 +186,7 @@ namespace randomx {
|
|||
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
|
||||
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
|
||||
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
|
||||
static const uint8_t REX_VPCMOV_XMM12[] = { 0x8F, 0x48, 0x18, 0xA2, 0xE6, 0xD0 };
|
||||
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
|
||||
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
|
||||
static const uint8_t CALL = 0xe8;
|
||||
|
@ -295,11 +298,23 @@ namespace randomx {
|
|||
cpuid(1, info);
|
||||
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
|
||||
|
||||
cpuid(0x80000001, info);
|
||||
hasXOP = ((info[2] & (1 << 11)) != 0);
|
||||
|
||||
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
|
||||
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
|
||||
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
|
||||
memcpy(code, codePrologue, prologueSize);
|
||||
if (hasXOP) {
|
||||
memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
|
||||
}
|
||||
else {
|
||||
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
|
||||
}
|
||||
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
|
||||
|
||||
codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);
|
||||
|
||||
# ifdef XMRIG_FIX_RYZEN
|
||||
mainLoopBounds.first = code + prologueSize;
|
||||
mainLoopBounds.second = code + epilogueOffset;
|
||||
|
@ -317,7 +332,7 @@ namespace randomx {
|
|||
|
||||
uint8_t* p;
|
||||
uint32_t n;
|
||||
if (flags & RANDOMX_FLAG_RYZEN) {
|
||||
if (flags & RANDOMX_FLAG_AMD) {
|
||||
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
|
||||
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
|
||||
}
|
||||
|
@ -385,7 +400,7 @@ namespace randomx {
|
|||
*(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
|
||||
if (hasAVX) {
|
||||
uint32_t* p = (uint32_t*)(code + codePos + 32);
|
||||
uint32_t* p = (uint32_t*)(code + codePos + 67);
|
||||
*p = (*p & 0xFF000000U) | 0x0077F8C5U;
|
||||
}
|
||||
|
||||
|
@ -393,10 +408,8 @@ namespace randomx {
|
|||
xmrig::Rx::setMainLoopBounds(mainLoopBounds);
|
||||
# endif
|
||||
|
||||
codePos = prologueSize;
|
||||
memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
|
||||
memcpy(code + codePos, codeLoopLoad, loopLoadSize);
|
||||
codePos += loopLoadSize;
|
||||
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
|
||||
codePos = codePosFirst;
|
||||
|
||||
//mark all registers as used
|
||||
uint64_t* r = (uint64_t*)registerUsage;
|
||||
|
@ -708,14 +721,31 @@ namespace randomx {
|
|||
uint8_t* const p = code;
|
||||
int pos = codePos;
|
||||
|
||||
const uint32_t dst = instr.dst;
|
||||
|
||||
emit(REX_MOV_RR64, p, pos);
|
||||
emitByte(0xc0 + instr.dst, p, pos);
|
||||
emitByte(0xc0 + dst, p, pos);
|
||||
emit(REX_MUL_R, p, pos);
|
||||
emitByte(0xe0 + instr.src, p, pos);
|
||||
emit(REX_MOV_R64R, p, pos);
|
||||
emitByte(0xc2 + 8 * instr.dst, p, pos);
|
||||
emitByte(0xc2 + 8 * dst, p, pos);
|
||||
|
||||
registerUsage[instr.dst] = pos;
|
||||
registerUsage[dst] = pos;
|
||||
codePos = pos;
|
||||
}
|
||||
|
||||
// Emits a fixed 8-byte machine-code sequence for IMULH_R (register source) at the
// current code position, then advances codePos by 8 and records the new position
// in registerUsage[instr.dst].
// NOTE(review): the byte pattern appears to decode as
//   mov  rdx, r(8+dst)
//   mulx r(8+dst), rax, r(8+src)
// i.e. the BMI2 form of the unsigned high multiply - confirm against the opcode tables.
void JitCompilerX86::h_IMULH_R_BMI2(const Instruction& instr) {
|
||||
// Work on local copies of the buffer pointer and write offset; codePos is stored back at the end.
uint8_t* const p = code;
|
||||
int pos = codePos;
|
||||
|
||||
const uint32_t src = instr.src;
|
||||
const uint32_t dst = instr.dst;
|
||||
|
||||
// First 4 bytes (little-endian: 49 8B D0 C4); dst is added into the third byte (bits 16+).
*(uint32_t*)(p + pos) = 0xC4D08B49 + (dst << 16);
|
||||
// Next 4 bytes (little-endian: 42 FB F6 C0); the last byte receives dst in bits 3-5
// (dst << 27) and src in bits 0-2 (src << 24).
// Assumes src and dst are register indices in 0..7 so the additions cannot carry
// into neighbouring fields - TODO confirm against Instruction.
*(uint32_t*)(p + pos + 4) = 0xC0F6FB42 + (dst << 27) + (src << 24);
|
||||
pos += 8;
|
||||
|
||||
// Record the code offset immediately after this instruction for the destination register.
registerUsage[dst] = pos;
|
||||
codePos = pos;
|
||||
}
|
||||
|
||||
|
@ -743,6 +773,29 @@ namespace randomx {
|
|||
codePos = pos;
|
||||
}
|
||||
|
||||
// Emits machine code for IMULH_M (memory source) at the current code position and
// records the resulting position in registerUsage[instr.dst].
// Two encodings are produced:
//  - src != dst: genAddressReg<false>() is emitted first, followed by 9 bytes of code;
//  - src == dst: 12 bytes of code whose last 4 bytes are the immediate masked with
//    ScratchpadL3Mask.
// NOTE(review): the byte patterns appear to decode as `mov rdx, r(8+dst)` followed by
// `mulx r(8+dst), rax, [rsi+rcx]` (first case) or `mulx r(8+dst), rax, [rsi+disp32]`
// (second case) - confirm against the opcode tables and genAddressReg.
void JitCompilerX86::h_IMULH_M_BMI2(const Instruction& instr) {
|
||||
// Work on local copies of the buffer pointer and write offset; codePos is stored back at the end.
uint8_t* const p = code;
|
||||
int pos = codePos;
|
||||
|
||||
// 64-bit locals so that the (dst << 59) shift below is performed in 64-bit arithmetic.
const uint64_t src = instr.src;
|
||||
const uint64_t dst = instr.dst;
|
||||
|
||||
if (src != dst) {
|
||||
// Emits the address computation and advances pos (both are passed in here).
genAddressReg<false>(instr, p, pos);
|
||||
// First 4 bytes (little-endian: 49 8B D0 C4); dst is added into the third byte.
*(uint32_t*)(p + pos) = static_cast<uint32_t>(0xC4D08B49 + (dst << 16));
|
||||
// 8-byte store of a 5-byte pattern (62 FB F6 04 0E) with dst added at bit 27.
// Only 5 of these bytes are kept: pos advances by 9 (4 + 5), so the trailing
// 3 zero bytes land beyond the new write position.
*(uint64_t*)(p + pos + 4) = 0x0E04F6FB62ULL + (dst << 27);
|
||||
pos += 9;
|
||||
}
|
||||
else {
|
||||
// Single 8-byte store (little-endian: 49 8B D0 C4 62 FB F6 86); dst is added into
// the third byte (dst << 16) and into bits 3-5 of the last byte (dst << 59).
*(uint64_t*)(p + pos) = 0x86F6FB62C4D08B49ULL + (dst << 16) + (dst << 59);
|
||||
// Followed by a 32-bit value: the instruction immediate masked with ScratchpadL3Mask.
*(uint32_t*)(p + pos + 8) = instr.getImm32() & ScratchpadL3Mask;
|
||||
pos += 12;
|
||||
}
|
||||
|
||||
// Record the code offset immediately after this instruction for the destination register.
registerUsage[dst] = pos;
|
||||
codePos = pos;
|
||||
}
|
||||
|
||||
void JitCompilerX86::h_ISMULH_R(const Instruction& instr) {
|
||||
uint8_t* const p = code;
|
||||
int pos = codePos;
|
||||
|
@ -992,7 +1045,12 @@ namespace randomx {
|
|||
const uint32_t dst = instr.dst % RegisterCountFlt;
|
||||
genAddressReg<true>(instr, p, pos);
|
||||
emit(REX_CVTDQ2PD_XMM12, p, pos);
|
||||
emit(REX_ANDPS_XMM12, p, pos);
|
||||
if (hasXOP) {
|
||||
emit(REX_VPCMOV_XMM12, p, pos);
|
||||
}
|
||||
else {
|
||||
emit(REX_ANDPS_XMM12, p, pos);
|
||||
}
|
||||
emit(REX_DIVPD, p, pos);
|
||||
emitByte(0xe4 + 8 * dst, p, pos);
|
||||
|
||||
|
@ -1014,18 +1072,21 @@ namespace randomx {
|
|||
uint8_t* const p = code;
|
||||
int pos = codePos;
|
||||
|
||||
emit(REX_MOV_RR64, p, pos);
|
||||
emitByte(0xc0 + instr.src, p, pos);
|
||||
int rotate = (13 - (instr.getImm32() & 63)) & 63;
|
||||
if (rotate != 0) {
|
||||
emit(ROL_RAX, p, pos);
|
||||
emitByte(rotate, p, pos);
|
||||
}
|
||||
if (vm_flags & RANDOMX_FLAG_RYZEN) {
|
||||
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
|
||||
const uint32_t src = instr.src;
|
||||
|
||||
*(uint32_t*)(p + pos) = 0x00C08B49 + (src << 16);
|
||||
const int rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
|
||||
*(uint32_t*)(p + pos + 3) = 0x00C8C148 + (rotate << 24);
|
||||
|
||||
if (vm_flags & RANDOMX_FLAG_AMD) {
|
||||
*(uint64_t*)(p + pos + 7) = 0x742024443B0CE083ULL;
|
||||
*(uint8_t*)(p + pos + 15) = 8;
|
||||
*(uint64_t*)(p + pos + 16) = 0x202444890414AE0FULL;
|
||||
pos += 24;
|
||||
}
|
||||
else {
|
||||
emit(AND_OR_MOV_LDMXCSR, p, pos);
|
||||
*(uint64_t*)(p + pos + 7) = 0x0414AE0F0CE083ULL;
|
||||
pos += 14;
|
||||
}
|
||||
|
||||
codePos = pos;
|
||||
|
|
|
@ -73,10 +73,12 @@ namespace randomx {
|
|||
std::pair<const void*, const void*> mainLoopBounds;
|
||||
# endif
|
||||
int32_t codePos;
|
||||
int32_t codePosFirst;
|
||||
uint32_t vm_flags;
|
||||
|
||||
static bool BranchesWithin32B;
|
||||
bool hasAVX;
|
||||
bool hasXOP;
|
||||
|
||||
static void applyTweaks();
|
||||
void generateProgramPrologue(Program&, ProgramConfiguration&);
|
||||
|
@ -121,7 +123,9 @@ namespace randomx {
|
|||
void h_IMUL_R(const Instruction&);
|
||||
void h_IMUL_M(const Instruction&);
|
||||
void h_IMULH_R(const Instruction&);
|
||||
void h_IMULH_R_BMI2(const Instruction&);
|
||||
void h_IMULH_M(const Instruction&);
|
||||
void h_IMULH_M_BMI2(const Instruction&);
|
||||
void h_ISMULH_R(const Instruction&);
|
||||
void h_ISMULH_M(const Instruction&);
|
||||
void h_IMUL_RCP(const Instruction&);
|
||||
|
|
|
@ -43,6 +43,7 @@
|
|||
.global DECL(randomx_program_prologue_first_load)
|
||||
.global DECL(randomx_program_loop_begin)
|
||||
.global DECL(randomx_program_loop_load)
|
||||
.global DECL(randomx_program_loop_load_xop)
|
||||
.global DECL(randomx_program_start)
|
||||
.global DECL(randomx_program_read_dataset)
|
||||
.global DECL(randomx_program_read_dataset_ryzen)
|
||||
|
@ -93,8 +94,12 @@ DECL(randomx_program_prologue_first_load):
|
|||
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
ror rdx, 32
|
||||
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
sub rsp, 24
|
||||
stmxcsr dword ptr [rsp]
|
||||
sub rsp, 40
|
||||
mov dword ptr [rsp], 0x9FC0
|
||||
mov dword ptr [rsp+4], 0xBFC0
|
||||
mov dword ptr [rsp+8], 0xDFC0
|
||||
mov dword ptr [rsp+12], 0xFFC0
|
||||
mov dword ptr [rsp+32], -1
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
|
@ -110,6 +115,9 @@ DECL(randomx_program_loop_begin):
|
|||
DECL(randomx_program_loop_load):
|
||||
#include "asm/program_loop_load.inc"
|
||||
|
||||
DECL(randomx_program_loop_load_xop):
|
||||
#include "asm/program_loop_load_xop.inc"
|
||||
|
||||
DECL(randomx_program_start):
|
||||
nop
|
||||
|
||||
|
|
|
@ -34,6 +34,7 @@ PUBLIC randomx_program_prologue
|
|||
PUBLIC randomx_program_prologue_first_load
|
||||
PUBLIC randomx_program_loop_begin
|
||||
PUBLIC randomx_program_loop_load
|
||||
PUBLIC randomx_program_loop_load_xop
|
||||
PUBLIC randomx_program_start
|
||||
PUBLIC randomx_program_read_dataset
|
||||
PUBLIC randomx_program_read_dataset_ryzen
|
||||
|
@ -81,8 +82,12 @@ randomx_program_prologue_first_load PROC
|
|||
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
ror rdx, 32
|
||||
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
sub rsp, 24
|
||||
stmxcsr dword ptr [rsp]
|
||||
sub rsp, 40
|
||||
mov dword ptr [rsp], 9FC0h
|
||||
mov dword ptr [rsp+4], 0BFC0h
|
||||
mov dword ptr [rsp+8], 0DFC0h
|
||||
mov dword ptr [rsp+12], 0FFC0h
|
||||
mov dword ptr [rsp+32], -1
|
||||
nop
|
||||
nop
|
||||
nop
|
||||
|
@ -101,6 +106,10 @@ randomx_program_loop_load PROC
|
|||
include asm/program_loop_load.inc
|
||||
randomx_program_loop_load ENDP
|
||||
|
||||
randomx_program_loop_load_xop PROC
|
||||
include asm/program_loop_load_xop.inc
|
||||
randomx_program_loop_load_xop ENDP
|
||||
|
||||
randomx_program_start PROC
|
||||
nop
|
||||
randomx_program_start ENDP
|
||||
|
|
|
@ -35,6 +35,7 @@ extern "C" {
|
|||
void randomx_program_prologue_first_load();
|
||||
void randomx_program_loop_begin();
|
||||
void randomx_program_loop_load();
|
||||
void randomx_program_loop_load_xop();
|
||||
void randomx_program_start();
|
||||
void randomx_program_read_dataset();
|
||||
void randomx_program_read_dataset_ryzen();
|
||||
|
|
|
@ -41,6 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "crypto/randomx/jit_compiler_a64_static.hpp"
|
||||
#endif
|
||||
|
||||
#include "backend/cpu/Cpu.h"
|
||||
|
||||
#include <cassert>
|
||||
|
||||
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
|
||||
|
@ -236,14 +238,29 @@ void RandomX_ConfigurationBase::Apply()
|
|||
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
|
||||
for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); }
|
||||
|
||||
#define INST_HANDLE2(x, func_name, prev) \
|
||||
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
|
||||
for (; k < CEIL_##x; ++k) { JIT_HANDLE(func_name, prev); }
|
||||
|
||||
INST_HANDLE(IADD_RS, NULL);
|
||||
INST_HANDLE(IADD_M, IADD_RS);
|
||||
INST_HANDLE(ISUB_R, IADD_M);
|
||||
INST_HANDLE(ISUB_M, ISUB_R);
|
||||
INST_HANDLE(IMUL_R, ISUB_M);
|
||||
INST_HANDLE(IMUL_M, IMUL_R);
|
||||
INST_HANDLE(IMULH_R, IMUL_M);
|
||||
INST_HANDLE(IMULH_M, IMULH_R);
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
if (xmrig::Cpu::info()->hasBMI2()) {
|
||||
INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M);
|
||||
INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R);
|
||||
}
|
||||
else
|
||||
#endif
|
||||
{
|
||||
INST_HANDLE(IMULH_R, IMUL_M);
|
||||
INST_HANDLE(IMULH_M, IMULH_R);
|
||||
}
|
||||
|
||||
INST_HANDLE(ISMULH_R, IMULH_M);
|
||||
INST_HANDLE(ISMULH_M, ISMULH_R);
|
||||
INST_HANDLE(IMUL_RCP, ISMULH_M);
|
||||
|
|
|
@ -49,7 +49,7 @@ enum randomx_flags {
|
|||
RANDOMX_FLAG_FULL_MEM = 4,
|
||||
RANDOMX_FLAG_JIT = 8,
|
||||
RANDOMX_FLAG_1GB_PAGES = 16,
|
||||
RANDOMX_FLAG_RYZEN = 64,
|
||||
RANDOMX_FLAG_AMD = 64,
|
||||
};
|
||||
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue