RandomX: optimized loading from scratchpad
Prefetches scratchpad data as soon as possible to calculate data address for the next load.

Up to ~1.4% speedup on Ryzen 7 3700X @ 4.1 GHz, RAM 3200 MHz 14-14-14-28 with optimized sub-timings:

Variant|Before H/S|After H/S
-------|----------|---------
rx/0|8663|8777
rx/wow|9867|10009
rx/loki|8652|8731
This commit is contained in:
parent
01b2c952ea
commit
2322e3bcf7
9 changed files with 75 additions and 21 deletions
|
@ -37,7 +37,10 @@
|
|||
#define WINABI
|
||||
#endif
|
||||
|
||||
/*
 * Symbols exported from this file. Each name is wrapped in DECL(...), a
 * macro defined outside this excerpt (presumably it applies the platform
 * symbol decoration - confirm against its definition).
 */
/* Start and end labels bracketing the scratchpad prefetch snippet. */
.global DECL(randomx_prefetch_scratchpad)
|
||||
.global DECL(randomx_prefetch_scratchpad_end)
|
||||
/* Program prologue, plus the label inside it where the first pair of
 * scratchpad offsets is computed. */
.global DECL(randomx_program_prologue)
|
||||
.global DECL(randomx_program_prologue_first_load)
|
||||
/* Remaining program labels; their definitions are outside this excerpt. */
.global DECL(randomx_program_loop_begin)
|
||||
.global DECL(randomx_program_loop_load)
|
||||
.global DECL(randomx_program_start)
|
||||
|
@ -61,6 +64,16 @@
|
|||
|
||||
#define db .byte
|
||||
|
||||
/*
 * randomx_prefetch_scratchpad .. randomx_prefetch_scratchpad_end
 *
 * Issues two prefetcht0 hints for the addresses [rsi + off0] and
 * [rsi + off1], where
 *     off0 = (low  32 bits of rax) & RANDOMX_SCRATCHPAD_MASK
 *     off1 = (high 32 bits of rax) & RANDOMX_SCRATCHPAD_MASK
 *
 * In:    rax = 64-bit value holding both 32-bit address halves
 *        rsi = base added to both offsets
 *              (presumably the scratchpad base pointer - TODO confirm)
 * Out:   rax = off0, rdx = off1 (both zero-extended to 64 bits)
 * Clobb: rax, rdx, flags
 *
 * There is no ret between the start and end labels: as written, execution
 * falls through past the end label.
 * NOTE(review): the start/end label pair suggests this is a code template
 * that is copied elsewhere rather than called - confirm against the users
 * of these symbols before changing instruction order or encoding.
 * RANDOMX_SCRATCHPAD_MASK is defined outside this excerpt.
 */
DECL(randomx_prefetch_scratchpad):
|
||||
/* Keep a full 64-bit copy; the upper half is consumed further down. */
mov rdx, rax
|
||||
/* 32-bit write: also clears the upper 32 bits of rax, leaving off0. */
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
prefetcht0 [rsi+rax]
|
||||
/* Rotate by 32 to bring the original upper half of rax into edx. */
ror rdx, 32
|
||||
/* 32-bit write: also clears the upper 32 bits of rdx, leaving off1. */
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
prefetcht0 [rsi+rdx]
|
||||
|
||||
/* Marks the first byte after the prefetch snippet. */
DECL(randomx_prefetch_scratchpad_end):
|
||||
|
||||
.balign 64
|
||||
DECL(randomx_program_prologue):
|
||||
#if defined(WINABI)
|
||||
|
@ -71,6 +84,14 @@ DECL(randomx_program_prologue):
|
|||
movapd xmm13, xmmword ptr [mantissaMask+rip]
|
||||
movapd xmm14, xmmword ptr [exp240+rip]
|
||||
movapd xmm15, xmmword ptr [scaleMask+rip]
|
||||
|
||||
/*
 * randomx_program_prologue_first_load
 *
 * Splits rax into two masked 32-bit offsets and then jumps to
 * randomx_program_loop_begin:
 *     eax = (low  32 bits of rax) & RANDOMX_SCRATCHPAD_MASK
 *     edx = (high 32 bits of rax) & RANDOMX_SCRATCHPAD_MASK
 *
 * In:    rax = 64-bit value holding both 32-bit address halves
 *        r8  = operand of the two xor instructions below
 * Out:   rax, rdx = masked offsets, zero-extended to 64 bits
 * Clobb: rax, rdx, flags
 *
 * RANDOMX_SCRATCHPAD_MASK is defined outside this excerpt.
 */
DECL(randomx_program_prologue_first_load):
|
||||
/*
 * The next two instructions are identical, so as assembled they cancel
 * out and leave rax unchanged (only the flags are affected).
 * NOTE(review): this looks like a pair of placeholders whose register
 * operands are rewritten after assembly - confirm before removing or
 * merging them, since their size and position may be relied upon.
 */
xor rax, r8
|
||||
xor rax, r8
|
||||
/* Keep a full 64-bit copy; the upper half is consumed further down. */
mov rdx, rax
|
||||
/* 32-bit write: also clears the upper 32 bits of rax. */
and eax, RANDOMX_SCRATCHPAD_MASK
|
||||
/* Rotate by 32 to bring the original upper half of rax into edx. */
ror rdx, 32
|
||||
/* 32-bit write: also clears the upper 32 bits of rdx. */
and edx, RANDOMX_SCRATCHPAD_MASK
|
||||
/* Unconditional jump; control does not return to this point. */
jmp DECL(randomx_program_loop_begin)
|
||||
|
||||
.balign 64
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue