Merge xmrig v6.12.2 into master

This commit is contained in:
MoneroOcean 2021-05-31 15:06:47 +00:00
commit 28ad107de8
28 changed files with 987 additions and 506 deletions

View file

@ -22,6 +22,7 @@
mov rsi, rdx ;# uint8_t* scratchpad
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8

View file

@ -35,6 +35,7 @@
mov rbx, r9 ;# loop counter
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8

View file

@ -1,17 +1,16 @@
mov ecx, ebp ;# ecx = ma
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
mov edx, ebp ;# edx = ma
and edx, RANDOMX_DATASET_BASE_MASK
lea rcx, [rdi+rdx] ;# dataset cache line
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

View file

@ -1,17 +0,0 @@
mov rcx, rbp ;# ecx = ma
shr rcx, 32
and ecx, RANDOMX_DATASET_BASE_MASK
xor r8, qword ptr [rdi+rcx]
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
and edx, RANDOMX_DATASET_BASE_MASK
prefetchnta byte ptr [rdi+rdx]
ror rbp, 32 ;# swap "ma" and "mx"
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]

View file

@ -7,4 +7,4 @@
xor r13, qword ptr [rsp+16]
xor r14, qword ptr [rsp+8]
xor r15, qword ptr [rsp+0]
add rsp, 72
add rsp, 200

View file

@ -1,4 +1,4 @@
sub rsp, 72
sub rsp, 200
mov qword ptr [rsp+64], rbx
mov qword ptr [rsp+56], r8
mov qword ptr [rsp+48], r9
@ -8,10 +8,10 @@
mov qword ptr [rsp+16], r13
mov qword ptr [rsp+8], r14
mov qword ptr [rsp+0], r15
xor rbp, rax ;# modify "mx"
ror rbp, 32 ;# swap "ma" and "mx"
mov ebx, ebp ;# ecx = ma
and ebx, RANDOMX_DATASET_BASE_MASK
shr ebx, 6 ;# ebx = Dataset block number
xor rbp, rax ;# modify "mx"
mov rbx, rbp ;# ebx = ma
shr rbx, 38
and ebx, RANDOMX_DATASET_BASE_MASK / 64 ;# ebx = Dataset block number
;# add ebx, datasetOffset / 64
;# call 32768

View file

@ -304,6 +304,9 @@ literal_v14: .fill 2,8,0
literal_v15: .fill 2,8,0
DECL(randomx_program_aarch64_vm_instructions_end):
# Calculate dataset pointer for dataset read
# Do it here to break false dependency from readReg2 and readReg3 (see next line)
lsr x10, x9, 32
# mx ^= r[readReg2] ^ r[readReg3];
eor x9, x9, x18
@ -321,8 +324,6 @@ DECL(randomx_program_aarch64_cacheline_align_mask1):
# mx <-> ma
ror x9, x9, 32
# Calculate dataset pointer for dataset read
mov w10, w9
DECL(randomx_program_aarch64_cacheline_align_mask2):
# Actual mask will be inserted by JIT compiler
and x10, x10, 1

View file

@ -110,13 +110,12 @@ namespace randomx {
#define ADDR(x) ((uint8_t*)&x)
# endif
#define codePrefetchScratchpad ADDR(randomx_prefetch_scratchpad)
#define codePrefetchScratchpadEnd ADDR(randomx_prefetch_scratchpad_end)
#define codePrologue ADDR(randomx_program_prologue)
#define codeLoopBegin ADDR(randomx_program_loop_begin)
#define codeLoopLoad ADDR(randomx_program_loop_load)
#define codeLoopLoadXOP ADDR(randomx_program_loop_load_xop)
#define codeProgamStart ADDR(randomx_program_start)
#define codeReadDataset ADDR(randomx_program_read_dataset)
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
@ -134,10 +133,10 @@ namespace randomx {
#define codeShhEnd ADDR(randomx_sshash_end)
#define codeShhInit ADDR(randomx_sshash_init)
#define prefetchScratchpadSize (codePrefetchScratchpadEnd - codePrefetchScratchpad)
#define prologueSize (codeLoopBegin - codePrologue)
#define loopLoadSize (codeLoopLoadXOP - codeLoopLoad)
#define loopLoadXOPSize (codeProgamStart - codeLoopLoadXOP)
#define readDatasetSize (codeReadDatasetLightSshInit - codeReadDataset)
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
@ -321,26 +320,13 @@ namespace randomx {
vm_flags = flags;
generateProgramPrologue(prog, pcfg);
uint8_t* p;
uint32_t n;
if (flags & RANDOMX_FLAG_AMD) {
p = RandomX_CurrentConfig.codeReadDatasetRyzenTweaked;
n = RandomX_CurrentConfig.codeReadDatasetRyzenTweakedSize;
}
else {
p = RandomX_CurrentConfig.codeReadDatasetTweaked;
n = RandomX_CurrentConfig.codeReadDatasetTweakedSize;
}
memcpy(code + codePos, p, n);
codePos += n;
emit(codeReadDataset, readDatasetSize, code, codePos);
generateProgramEpilogue(prog, pcfg);
}
void JitCompilerX86::generateProgramLight(Program& prog, ProgramConfiguration& pcfg, uint32_t datasetOffset) {
generateProgramPrologue(prog, pcfg);
emit(RandomX_CurrentConfig.codeReadDatasetLightSshInitTweaked, readDatasetLightInitSize, code, codePos);
emit(codeReadDatasetLightSshInit, readDatasetLightInitSize, code, codePos);
*(uint32_t*)(code + codePos) = 0xc381;
codePos += 2;
emit32(datasetOffset / CacheLineSize, code, codePos);
@ -467,7 +453,7 @@ namespace randomx {
void JitCompilerX86::generateProgramEpilogue(Program& prog, ProgramConfiguration& pcfg) {
*(uint64_t*)(code + codePos) = 0xc03349c08b49ull + (static_cast<uint64_t>(pcfg.readReg0) << 16) + (static_cast<uint64_t>(pcfg.readReg1) << 40);
codePos += 6;
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);
emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, RandomX_CurrentConfig.codePrefetchScratchpadTweakedSize, code, codePos);
memcpy(code + codePos, codeLoopStore, loopStoreSize);
codePos += loopStoreSize;

View file

@ -38,6 +38,7 @@
#endif
.global DECL(randomx_prefetch_scratchpad)
.global DECL(randomx_prefetch_scratchpad_bmi2)
.global DECL(randomx_prefetch_scratchpad_end)
.global DECL(randomx_program_prologue)
.global DECL(randomx_program_prologue_first_load)
@ -47,7 +48,6 @@
.global DECL(randomx_program_loop_load_xop)
.global DECL(randomx_program_start)
.global DECL(randomx_program_read_dataset)
.global DECL(randomx_program_read_dataset_ryzen)
.global DECL(randomx_program_read_dataset_sshash_init)
.global DECL(randomx_program_read_dataset_sshash_fin)
.global DECL(randomx_program_loop_store)
@ -80,6 +80,13 @@ DECL(randomx_prefetch_scratchpad):
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
DECL(randomx_prefetch_scratchpad_bmi2):
rorx rdx, rax, 32
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
DECL(randomx_prefetch_scratchpad_end):
.balign 64
@ -132,9 +139,6 @@ DECL(randomx_program_start):
DECL(randomx_program_read_dataset):
#include "asm/program_read_dataset.inc"
DECL(randomx_program_read_dataset_ryzen):
#include "asm/program_read_dataset_ryzen.inc"
DECL(randomx_program_read_dataset_sshash_init):
#include "asm/program_read_dataset_sshash_init.inc"

View file

@ -29,6 +29,7 @@ IFDEF RAX
_RANDOMX_JITX86_STATIC SEGMENT PAGE READ EXECUTE
PUBLIC randomx_prefetch_scratchpad
PUBLIC randomx_prefetch_scratchpad_bmi2
PUBLIC randomx_prefetch_scratchpad_end
PUBLIC randomx_program_prologue
PUBLIC randomx_program_prologue_first_load
@ -38,7 +39,6 @@ PUBLIC randomx_program_loop_load
PUBLIC randomx_program_loop_load_xop
PUBLIC randomx_program_start
PUBLIC randomx_program_read_dataset
PUBLIC randomx_program_read_dataset_ryzen
PUBLIC randomx_program_read_dataset_sshash_init
PUBLIC randomx_program_read_dataset_sshash_fin
PUBLIC randomx_dataset_init
@ -70,6 +70,14 @@ randomx_prefetch_scratchpad PROC
prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad ENDP
randomx_prefetch_scratchpad_bmi2 PROC
rorx rdx, rax, 32
and eax, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rax]
and edx, RANDOMX_SCRATCHPAD_MASK
prefetcht0 [rsi+rdx]
randomx_prefetch_scratchpad_bmi2 ENDP
randomx_prefetch_scratchpad_end PROC
randomx_prefetch_scratchpad_end ENDP
@ -127,10 +135,6 @@ randomx_program_read_dataset PROC
include asm/program_read_dataset.inc
randomx_program_read_dataset ENDP
randomx_program_read_dataset_ryzen PROC
include asm/program_read_dataset_ryzen.inc
randomx_program_read_dataset_ryzen ENDP
randomx_program_read_dataset_sshash_init PROC
include asm/program_read_dataset_sshash_init.inc
randomx_program_read_dataset_sshash_init ENDP

View file

@ -30,6 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
extern "C" {
void randomx_prefetch_scratchpad();
void randomx_prefetch_scratchpad_bmi2();
void randomx_prefetch_scratchpad_end();
void randomx_program_prologue();
void randomx_program_prologue_first_load();
@ -39,7 +40,6 @@ extern "C" {
void randomx_program_loop_load_xop();
void randomx_program_start();
void randomx_program_read_dataset();
void randomx_program_read_dataset_ryzen();
void randomx_program_read_dataset_sshash_init();
void randomx_program_read_dataset_sshash_fin();
void randomx_program_loop_store();

View file

@ -191,27 +191,17 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
const uint8_t* b = addr(randomx_sshash_end);
memcpy(codeShhPrefetchTweaked, a, b - a);
}
{
const uint8_t* a = addr(randomx_program_read_dataset);
const uint8_t* b = addr(randomx_program_read_dataset_ryzen);
memcpy(codeReadDatasetTweaked, a, b - a);
codeReadDatasetTweakedSize = b - a;
}
{
const uint8_t* a = addr(randomx_program_read_dataset_ryzen);
const uint8_t* b = addr(randomx_program_read_dataset_sshash_init);
memcpy(codeReadDatasetRyzenTweaked, a, b - a);
codeReadDatasetRyzenTweakedSize = b - a;
}
{
const uint8_t* a = addr(randomx_program_read_dataset_sshash_init);
const uint8_t* b = addr(randomx_program_read_dataset_sshash_fin);
memcpy(codeReadDatasetLightSshInitTweaked, a, b - a);
}
{
const uint8_t* a = addr(randomx_prefetch_scratchpad);
if (xmrig::Cpu::info()->hasBMI2()) {
const uint8_t* a = addr(randomx_prefetch_scratchpad_bmi2);
const uint8_t* b = addr(randomx_prefetch_scratchpad_end);
memcpy(codePrefetchScratchpadTweaked, a, b - a);
codePrefetchScratchpadTweakedSize = b - a;
}
else {
const uint8_t* a = addr(randomx_prefetch_scratchpad);
const uint8_t* b = addr(randomx_prefetch_scratchpad_bmi2);
memcpy(codePrefetchScratchpadTweaked, a, b - a);
codePrefetchScratchpadTweakedSize = b - a;
}
# endif
}
@ -250,13 +240,15 @@ void RandomX_ConfigurationBase::Apply()
*(uint32_t*)(codeReadDatasetTweaked + 23) = DatasetBaseMask;
*(uint32_t*)(codeReadDatasetLightSshInitTweaked + 59) = DatasetBaseMask;
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
const bool hasBMI2 = xmrig::Cpu::info()->hasBMI2();
*(uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 7 : 4)) = ScratchpadL3Mask64_Calculated;
*(uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 17 : 18)) = ScratchpadL3Mask64_Calculated;
// Apply scratchpad prefetch mode
{
uint32_t* a = (uint32_t*)(codePrefetchScratchpadTweaked + 8);
uint32_t* b = (uint32_t*)(codePrefetchScratchpadTweaked + 22);
uint32_t* a = (uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 11 : 8));
uint32_t* b = (uint32_t*)(codePrefetchScratchpadTweaked + (hasBMI2 ? 21 : 22));
switch (scratchpadPrefetchMode)
{
@ -323,7 +315,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
INST_HANDLE(IMUL_M, IMUL_R);
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) {
if (hasBMI2) {
INST_HANDLE2(IMULH_R, IMULH_R_BMI2, IMUL_M);
INST_HANDLE2(IMULH_M, IMULH_M_BMI2, IMULH_R);
}
@ -365,7 +357,7 @@ typedef void(randomx::JitCompilerX86::* InstructionGeneratorX86_2)(const randomx
#endif
#if defined(_M_X64) || defined(__x86_64__)
if (xmrig::Cpu::info()->hasBMI2()) {
if (hasBMI2) {
INST_HANDLE2(CFROUND, CFROUND_BMI2, CBRANCH);
}
else

View file

@ -126,12 +126,8 @@ struct RandomX_ConfigurationBase
rx_vec_i128 fillAes4Rx4_Key[8];
uint8_t codeShhPrefetchTweaked[20];
uint8_t codeReadDatasetTweaked[64];
uint32_t codeReadDatasetTweakedSize;
uint8_t codeReadDatasetRyzenTweaked[72];
uint32_t codeReadDatasetRyzenTweakedSize;
uint8_t codeReadDatasetLightSshInitTweaked[68];
uint8_t codePrefetchScratchpadTweaked[32];
uint8_t codePrefetchScratchpadTweaked[28];
uint32_t codePrefetchScratchpadTweakedSize;
uint32_t CacheLineAlignMask_Calculated;