RandomX: optimized loading from scratchpad
Prefetches scratchpad data as soon as it is possible to calculate the data address for the next load.

Up to ~1.4% speedup on Ryzen 7 3700X @ 4.1 GHz, RAM 3200 MHz 14-14-14-28 with optimized sub-timings:

| Variant | Before H/S | After H/S |
|---------|------------|-----------|
| rx/0    | 8663       | 8777      |
| rx/wow  | 9867       | 10009     |
| rx/loki | 8652       | 8731      |
This commit is contained in:
parent
01b2c952ea
commit
2322e3bcf7
9 changed files with 75 additions and 21 deletions
|
@ -149,9 +149,9 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
|
|||
memcpy(codeReadDatasetLightSshInitTweaked, a, b - a);
|
||||
}
|
||||
{
|
||||
const uint8_t* a = (const uint8_t*)&randomx_program_loop_load;
|
||||
const uint8_t* b = (const uint8_t*)&randomx_program_start;
|
||||
memcpy(codeLoopLoadTweaked, a, b - a);
|
||||
const uint8_t* a = (const uint8_t*)&randomx_prefetch_scratchpad;
|
||||
const uint8_t* b = (const uint8_t*)&randomx_prefetch_scratchpad_end;
|
||||
memcpy(codePrefetchScratchpadTweaked, a, b - a);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
@ -177,8 +177,8 @@ void RandomX_ConfigurationBase::Apply()
|
|||
ScratchpadL3Mask64_Calculated = ((ScratchpadL3_Size / sizeof(uint64_t)) / 8 - 1) * 64;
|
||||
|
||||
#if defined(_M_X64) || defined(__x86_64__)
|
||||
*(uint32_t*)(codeLoopLoadTweaked + 4) = ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(codeLoopLoadTweaked + 50) = ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(codePrefetchScratchpadTweaked + 4) = ScratchpadL3Mask64_Calculated;
|
||||
*(uint32_t*)(codePrefetchScratchpadTweaked + 18) = ScratchpadL3Mask64_Calculated;
|
||||
#endif
|
||||
|
||||
ConditionMask_Calculated = (1 << JumpBits) - 1;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue