This commit is contained in:
MoneroOcean 2020-02-02 11:22:09 -08:00
commit baeb45e8a1
30 changed files with 267 additions and 88 deletions

View file

@ -1,5 +1,5 @@
;# save VM register values
add rsp, 24
add rsp, 40
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9

View file

@ -1,5 +1,5 @@
lea rcx, [rsi+rax]
mov [rsp+8], rcx
mov [rsp+16], rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
@ -9,7 +9,7 @@
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
lea rcx, [rsi+rdx]
mov [rsp+16], rcx
mov [rsp+24], rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
@ -18,11 +18,11 @@
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
andps xmm4, xmm13
andps xmm5, xmm13
andps xmm6, xmm13
andps xmm7, xmm13
orps xmm4, xmm14
orps xmm5, xmm14
orps xmm6, xmm14
orps xmm7, xmm14
andpd xmm4, xmm13
andpd xmm5, xmm13
andpd xmm6, xmm13
andpd xmm7, xmm13
orpd xmm4, xmm14
orpd xmm5, xmm14
orpd xmm6, xmm14
orpd xmm7, xmm14

View file

@ -0,0 +1,24 @@
lea rcx, [rsi+rax]
mov [rsp+16], rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
lea rcx, [rsi+rdx]
mov [rsp+24], rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
cvtdq2pd xmm3, qword ptr [rcx+24]
cvtdq2pd xmm4, qword ptr [rcx+32]
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
vpcmov xmm4, xmm4, xmm14, xmm13
vpcmov xmm5, xmm5, xmm14, xmm13
vpcmov xmm6, xmm6, xmm14, xmm13
vpcmov xmm7, xmm7, xmm14, xmm13

View file

@ -1,4 +1,4 @@
mov rcx, [rsp+16]
mov rcx, [rsp+24]
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
@ -7,7 +7,7 @@
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
mov rcx, [rsp+8]
mov rcx, [rsp+16]
xorpd xmm0, xmm4
xorpd xmm1, xmm5
xorpd xmm2, xmm6

View file

@ -1,5 +1,5 @@
mantissaMask:
db 255, 255, 255, 255, 255, 255, 255, 0, 255, 255, 255, 255, 255, 255, 255, 0
db 0, 0, 192, 255, 255, 255, 255, 0, 0, 0, 192, 255, 255, 255, 255, 0
exp240:
db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
scaleMask:

View file

@ -89,6 +89,7 @@ namespace randomx {
const uint8_t* codePrologue = (uint8_t*)&randomx_program_prologue;
const uint8_t* codeLoopBegin = (uint8_t*)&randomx_program_loop_begin;
const uint8_t* codeLoopLoad = (uint8_t*)&randomx_program_loop_load;
const uint8_t* codeLoopLoadXOP = (uint8_t*)&randomx_program_loop_load_xop;
const uint8_t* codeProgamStart = (uint8_t*)&randomx_program_start;
const uint8_t* codeReadDatasetLightSshInit = (uint8_t*)&randomx_program_read_dataset_sshash_init;
const uint8_t* codeReadDatasetLightSshFin = (uint8_t*)&randomx_program_read_dataset_sshash_fin;
@ -104,7 +105,8 @@ namespace randomx {
const int32_t prefetchScratchpadSize = codePrefetchScratchpadEnd - codePrefetchScratchpad;
const int32_t prologueSize = codeLoopBegin - codePrologue;
const int32_t loopLoadSize = codeProgamStart - codeLoopLoad;
const int32_t loopLoadSize = codeLoopLoadXOP - codeLoopLoad;
const int32_t loopLoadXOPSize = codeProgamStart - codeLoopLoadXOP;
const int32_t readDatasetLightInitSize = codeReadDatasetLightSshFin - codeReadDatasetLightSshInit;
const int32_t readDatasetLightFinSize = codeLoopStore - codeReadDatasetLightSshFin;
const int32_t loopStoreSize = codeLoopEnd - codeLoopStore;
@ -184,6 +186,7 @@ namespace randomx {
static const uint8_t REX_XOR_RAX_R64[] = { 0x49, 0x33 };
static const uint8_t REX_XCHG[] = { 0x4d, 0x87 };
static const uint8_t REX_ANDPS_XMM12[] = { 0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6 };
static const uint8_t REX_VPCMOV_XMM12[] = { 0x8F, 0x48, 0x18, 0xA2, 0xE6, 0xD0 };
static const uint8_t REX_PADD[] = { 0x66, 0x44, 0x0f };
static const uint8_t PADD_OPCODES[] = { 0xfc, 0xfd, 0xfe, 0xd4 };
static const uint8_t CALL = 0xe8;
@ -295,11 +298,23 @@ namespace randomx {
cpuid(1, info);
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
cpuid(0x80000001, info);
hasXOP = ((info[2] & (1 << 11)) != 0);
allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
memcpy(code, codePrologue, prologueSize);
if (hasXOP) {
memcpy(code + prologueSize, codeLoopLoadXOP, loopLoadXOPSize);
}
else {
memcpy(code + prologueSize, codeLoopLoad, loopLoadSize);
}
memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
codePosFirst = prologueSize + (hasXOP ? loopLoadXOPSize : loopLoadSize);
# ifdef XMRIG_FIX_RYZEN
mainLoopBounds.first = code + prologueSize;
mainLoopBounds.second = code + epilogueOffset;
@ -317,7 +332,7 @@ namespace randomx {
uint8_t* p;
uint32_t n;
if (flags & RANDOMX_FLAG_RYZEN) {
if (flags & RANDOMX_FLAG_AMD) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
@ -385,7 +400,7 @@ namespace randomx {
*(uint32_t*)(code + codePos + 10) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
*(uint32_t*)(code + codePos + 20) = RandomX_CurrentConfig.ScratchpadL3Mask64_Calculated;
if (hasAVX) {
uint32_t* p = (uint32_t*)(code + codePos + 32);
uint32_t* p = (uint32_t*)(code + codePos + 67);
*p = (*p & 0xFF000000U) | 0x0077F8C5U;
}
@ -393,10 +408,8 @@ namespace randomx {
xmrig::Rx::setMainLoopBounds(mainLoopBounds);
# endif
codePos = prologueSize;
memcpy(code + codePos - 48, &pcfg.eMask, sizeof(pcfg.eMask));
memcpy(code + codePos, codeLoopLoad, loopLoadSize);
codePos += loopLoadSize;
memcpy(code + prologueSize - 48, &pcfg.eMask, sizeof(pcfg.eMask));
codePos = codePosFirst;
//mark all registers as used
uint64_t* r = (uint64_t*)registerUsage;
@ -708,14 +721,31 @@ namespace randomx {
uint8_t* const p = code;
int pos = codePos;
const uint32_t dst = instr.dst;
emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.dst, p, pos);
emitByte(0xc0 + dst, p, pos);
emit(REX_MUL_R, p, pos);
emitByte(0xe0 + instr.src, p, pos);
emit(REX_MOV_R64R, p, pos);
emitByte(0xc2 + 8 * instr.dst, p, pos);
emitByte(0xc2 + 8 * dst, p, pos);
registerUsage[instr.dst] = pos;
registerUsage[dst] = pos;
codePos = pos;
}
void JitCompilerX86::h_IMULH_R_BMI2(const Instruction& instr) {
uint8_t* const p = code;
int pos = codePos;
const uint32_t src = instr.src;
const uint32_t dst = instr.dst;
*(uint32_t*)(p + pos) = 0xC4D08B49 + (dst << 16);
*(uint32_t*)(p + pos + 4) = 0xC0F6FB42 + (dst << 27) + (src << 24);
pos += 8;
registerUsage[dst] = pos;
codePos = pos;
}
@ -743,6 +773,29 @@ namespace randomx {
codePos = pos;
}
void JitCompilerX86::h_IMULH_M_BMI2(const Instruction& instr) {
uint8_t* const p = code;
int pos = codePos;
const uint64_t src = instr.src;
const uint64_t dst = instr.dst;
if (src != dst) {
genAddressReg<false>(instr, p, pos);
*(uint32_t*)(p + pos) = static_cast<uint32_t>(0xC4D08B49 + (dst << 16));
*(uint64_t*)(p + pos + 4) = 0x0E04F6FB62ULL + (dst << 27);
pos += 9;
}
else {
*(uint64_t*)(p + pos) = 0x86F6FB62C4D08B49ULL + (dst << 16) + (dst << 59);
*(uint32_t*)(p + pos + 8) = instr.getImm32() & ScratchpadL3Mask;
pos += 12;
}
registerUsage[dst] = pos;
codePos = pos;
}
void JitCompilerX86::h_ISMULH_R(const Instruction& instr) {
uint8_t* const p = code;
int pos = codePos;
@ -992,7 +1045,12 @@ namespace randomx {
const uint32_t dst = instr.dst % RegisterCountFlt;
genAddressReg<true>(instr, p, pos);
emit(REX_CVTDQ2PD_XMM12, p, pos);
emit(REX_ANDPS_XMM12, p, pos);
if (hasXOP) {
emit(REX_VPCMOV_XMM12, p, pos);
}
else {
emit(REX_ANDPS_XMM12, p, pos);
}
emit(REX_DIVPD, p, pos);
emitByte(0xe4 + 8 * dst, p, pos);
@ -1014,18 +1072,21 @@ namespace randomx {
uint8_t* const p = code;
int pos = codePos;
emit(REX_MOV_RR64, p, pos);
emitByte(0xc0 + instr.src, p, pos);
int rotate = (13 - (instr.getImm32() & 63)) & 63;
if (rotate != 0) {
emit(ROL_RAX, p, pos);
emitByte(rotate, p, pos);
}
if (vm_flags & RANDOMX_FLAG_RYZEN) {
emit(AND_OR_MOV_LDMXCSR_RYZEN, p, pos);
const uint32_t src = instr.src;
*(uint32_t*)(p + pos) = 0x00C08B49 + (src << 16);
const int rotate = (static_cast<int>(instr.getImm32() & 63) - 2) & 63;
*(uint32_t*)(p + pos + 3) = 0x00C8C148 + (rotate << 24);
if (vm_flags & RANDOMX_FLAG_AMD) {
*(uint64_t*)(p + pos + 7) = 0x742024443B0CE083ULL;
*(uint8_t*)(p + pos + 15) = 8;
*(uint64_t*)(p + pos + 16) = 0x202444890414AE0FULL;
pos += 24;
}
else {
emit(AND_OR_MOV_LDMXCSR, p, pos);
*(uint64_t*)(p + pos + 7) = 0x0414AE0F0CE083ULL;
pos += 14;
}
codePos = pos;

View file

@ -73,10 +73,12 @@ namespace randomx {
std::pair<const void*, const void*> mainLoopBounds;
# endif
int32_t codePos;
int32_t codePosFirst;
uint32_t vm_flags;
static bool BranchesWithin32B;
bool hasAVX;
bool hasXOP;
static void applyTweaks();
void generateProgramPrologue(Program&, ProgramConfiguration&);
@ -121,7 +123,9 @@ namespace randomx {
void h_IMUL_R(const Instruction&);
void h_IMUL_M(const Instruction&);
void h_IMULH_R(const Instruction&);
void h_IMULH_R_BMI2(const Instruction&);
void h_IMULH_M(const Instruction&);
void h_IMULH_M_BMI2(const Instruction&);
void h_ISMULH_R(const Instruction&);
void h_ISMULH_M(const Instruction&);
void h_IMUL_RCP(const Instruction&);

View file

@ -43,6 +43,7 @@
.global DECL(randomx_program_prologue_first_load)
.global DECL(randomx_program_loop_begin)
.global DECL(randomx_program_loop_load)
.global DECL(randomx_program_loop_load_xop)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
@ -93,8 +94,12 @@ DECL(randomx_program_prologue_first_load):
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
sub rsp, 24
stmxcsr dword ptr [rsp]
sub rsp, 40
mov dword ptr [rsp], 0x9FC0
mov dword ptr [rsp+4], 0xBFC0
mov dword ptr [rsp+8], 0xDFC0
mov dword ptr [rsp+12], 0xFFC0
mov dword ptr [rsp+32], -1
nop
nop
nop
@ -110,6 +115,9 @@ DECL(randomx_program_loop_begin):
DECL(randomx_program_loop_load):
#include "asm/program_loop_load.inc"
DECL(randomx_program_loop_load_xop):
#include "asm/program_loop_load_xop.inc"
DECL(randomx_program_start):
nop

View file

@ -34,6 +34,7 @@ PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load
PUBLIC randomx_program_loop_begin
PUBLIC randomx_program_loop_load
PUBLIC randomx_program_loop_load_xop
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
@ -81,8 +82,12 @@ randomx_program_prologue_first_load PROC
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
sub rsp, 24
stmxcsr dword ptr [rsp]
sub rsp, 40
mov dword ptr [rsp], 9FC0h
mov dword ptr [rsp+4], 0BFC0h
mov dword ptr [rsp+8], 0DFC0h
mov dword ptr [rsp+12], 0FFC0h
mov dword ptr [rsp+32], -1
nop
nop
nop
@ -101,6 +106,10 @@ randomx_program_loop_load PROC
include asm/program_loop_load.inc
randomx_program_loop_load ENDP
randomx_program_loop_load_xop PROC
include asm/program_loop_load_xop.inc
randomx_program_loop_load_xop ENDP
randomx_program_start PROC
nop
randomx_program_start ENDP

View file

@ -35,6 +35,7 @@ extern "C" {
void randomx_program_prologue_first_load();
void randomx_program_loop_begin();
void randomx_program_loop_load();
void randomx_program_loop_load_xop();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();

View file

@ -41,6 +41,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "crypto/randomx/jit_compiler_a64_static.hpp"
#endif
#include "backend/cpu/Cpu.h"
#include <cassert>
RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
@ -236,14 +238,29 @@ void RandomX_ConfigurationBase::Apply()
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
for (; k < CEIL_##x; ++k) { JIT_HANDLE(x, prev); }
#define INST_HANDLE2(x, func_name, prev) \
CEIL_##x = CEIL_##prev + RANDOMX_FREQ_##x; \
for (; k < CEIL_##x; ++k) { JIT_HANDLE(func_name, prev); }
INST_HANDLE(IADD_RS, NULL);
INST_HANDLE(IADD_M, IADD_RS);
INST_HANDLE(ISUB_R, IADD_M);
INST_HANDLE(ISUB_M, ISUB_R);
INST_HANDLE(IMUL_R, ISUB_M);
INST_HANDLE(IMUL_M, IMUL_R);
INST_HANDLE(IMULH_R, IMUL_M);
INST_HANDLE(IMULH_M, IMULH_R);
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) {
INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M);
INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R);
}
else
#endif
{
INST_HANDLE(IMULH_R, IMUL_M);
INST_HANDLE(IMULH_M, IMULH_R);
}
INST_HANDLE(ISMULH_R, IMULH_M);
INST_HANDLE(ISMULH_M, ISMULH_R);
INST_HANDLE(IMUL_RCP, ISMULH_M);

View file

@ -49,7 +49,7 @@ enum randomx_flags {
RANDOMX_FLAG_FULL_MEM = 4,
RANDOMX_FLAG_JIT = 8,
RANDOMX_FLAG_1GB_PAGES = 16,
RANDOMX_FLAG_RYZEN = 64,
RANDOMX_FLAG_AMD = 64,
};