RandomX: optimized loading from scratchpad

Prefetches scratchpad data as soon as the data address for the next load can be calculated.

Up to ~1.4% speedup on Ryzen 7 3700X @ 4.1 GHz, RAM 3200 MHz 14-14-14-28 with optimized sub-timings:
Variant|Before H/s|After H/s
-------|----------|---------
rx/0|8663|8777
rx/wow|9867|10009
rx/loki|8652|8731
This commit is contained in:
SChernykh 2019-09-11 19:10:01 +02:00
parent 01b2c952ea
commit 2322e3bcf7
9 changed files with 75 additions and 21 deletions

View file

@ -1,5 +1,3 @@
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
@ -10,8 +8,6 @@
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]

View file

@ -1,4 +1,3 @@
xor eax, eax
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9