Dataset initialization with AVX2 (WIP)

This commit is contained in:
SChernykh 2020-12-18 14:53:54 +01:00
parent 6b21a51a2f
commit 515a85e66c
17 changed files with 721 additions and 90 deletions

View file

@ -49,8 +49,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef _MSC_VER
# include <intrin.h>
#else
# include <cpuid.h>
#endif
static bool hugePagesJIT = false;
@ -116,6 +114,11 @@ namespace randomx {
#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
#define codeDatasetInit ADDR(randomx_dataset_init)
#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
#define codeLoopStore ADDR(randomx_program_loop_store)
#define codeLoopEnd ADDR(randomx_program_loop_end)
#define codeEpilogue ADDR(randomx_program_epilogue)
@ -132,7 +135,12 @@ namespace randomx {
#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
#define loopStoreSize (codeLoopEnd - codeLoopStore)
#define datasetInitSize (codeEpilogue - codeDatasetInit)
#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
#define epilogueSize (codeShhLoad - codeEpilogue)
#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
@ -192,17 +200,6 @@ namespace randomx {
xmrig::VirtualMemory::protectRX(p1, p2 - p1);
}
static inline void cpuid(uint32_t level, int32_t output[4])
{
memset(output, 0, sizeof(int32_t) * 4);
# ifdef _MSC_VER
__cpuid(output, static_cast<int>(level));
# else
__cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
# endif
}
# ifdef _MSC_VER
static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
# else
@ -215,14 +212,11 @@ namespace randomx {
JitCompilerX86::JitCompilerX86(bool hugePagesEnable) {
BranchesWithin32B = xmrig::Cpu::info()->jccErratum();
int32_t info[4];
cpuid(1, info);
hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
hasAVX = xmrig::Cpu::info()->hasAVX();
hasAVX2 = xmrig::Cpu::info()->hasAVX2();
hasXOP = xmrig::Cpu::info()->hasXOP();
cpuid(0x80000001, info);
hasXOP = ((info[2] & (1 << 11)) != 0);
allocatedSize = CodeSize * 2;
allocatedSize = hasAVX2 ? (CodeSize * 4) : (CodeSize * 2);
allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
# ifdef XMRIG_SECURE_JIT
false
@ -304,14 +298,49 @@ namespace randomx {
template<size_t N>
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
uint8_t* p = code;
if (hasAVX2) {
codePos = 0;
emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<true>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
codePos += 3;
emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
uint8_t* p = code + codePos;
emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
p[3] += prog.getAddressRegister() << 3;
}
}
emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
constexpr int32_t prologue_size = 320;
*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
return;
}
memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
codePos = superScalarHashOffset + codeSshInitSize;
for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
SuperscalarProgram& prog = programs[j];
for (unsigned i = 0; i < prog.getSize(); ++i) {
Instruction& instr = prog(i);
generateSuperscalarCode(instr);
uint32_t pos = codePos;
for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
generateSuperscalarCode<false>(prog(i), p, pos);
}
codePos = pos;
emit(codeShhLoad, codeSshLoadSize, code, codePos);
if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
@ -326,7 +355,10 @@ namespace randomx {
void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);
void JitCompilerX86::generateDatasetInitCode() {
memcpy(code, codeDatasetInit, datasetInitSize);
// AVX2 code is generated in generateSuperscalarHash()
if (!hasAVX2) {
memcpy(code, codeDatasetInit, datasetInitSize);
}
}
void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
@ -405,85 +437,243 @@ namespace randomx {
emit32(epilogueOffset - codePos - 4, code, codePos);
}
void JitCompilerX86::generateSuperscalarCode(Instruction& instr) {
static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
static constexpr uint8_t REX_81[] = { 0x49, 0x81 };
static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d };
static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
template<bool AVX2>
FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) {
switch ((SuperscalarInstructionType)instr.opcode)
{
case randomx::SuperscalarInstructionType::ISUB_R:
emit(REX_SUB_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_R:
emit(REX_XOR_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16);
codePos += 3;
if (AVX2) {
emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IADD_RS:
emit(REX_LEA, code, codePos);
emitByte(0x04 + 8 * instr.dst, code, codePos);
genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos);
if (AVX2) {
if (instr.getModShift()) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.src;
p[4] = instr.getModShift();
p[8] += instr.dst * 9;
}
else {
emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos);
}
}
break;
case randomx::SuperscalarInstructionType::IMUL_R:
emit(REX_IMUL_RR, code, codePos);
emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x35, 0xF4, 0xD8,
0xC5, 0xBD, 0xF4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.dst;
p[21] += instr.dst * 8 + instr.src;
p[29] -= instr.dst * 8;
p[31] += instr.dst;
p[41] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::IROR_C:
emit(REX_ROT_I8, code, codePos);
emitByte(0xc8 + instr.dst, code, codePos);
emitByte(instr.getImm32() & 63, code, codePos);
{
const uint32_t shift = instr.getImm32() & 63;
emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[4] = shift;
p[8] += instr.dst;
p[9] = 64 - shift;
p[14] += instr.dst * 8;
}
}
break;
case randomx::SuperscalarInstructionType::IADD_C7:
case randomx::SuperscalarInstructionType::IADD_C8:
case randomx::SuperscalarInstructionType::IADD_C9:
emit(REX_81, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IXOR_C7:
case randomx::SuperscalarInstructionType::IXOR_C8:
case randomx::SuperscalarInstructionType::IXOR_C9:
emit(REX_XOR_RI, code, codePos);
emitByte(0xf0 + instr.dst, code, codePos);
emit32(instr.getImm32(), code, codePos);
if (AVX2) {
static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 };
uint8_t* p = code + codePos;
emit(t, code, codePos);
*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32());
p[12] += instr.dst * 8;
p[24] -= instr.dst * 8;
p[26] += instr.dst * 8;
}
else {
*(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16);
codePos += 3;
emit32(instr.getImm32(), code, codePos);
}
break;
case randomx::SuperscalarInstructionType::IMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe0 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC5, 0xBD, 0xD4, 0xC0,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC5, 0xA5, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[27] += instr.dst * 8;
p[67] += instr.dst * 9;
p[77] += instr.dst * 9;
}
break;
case randomx::SuperscalarInstructionType::ISMULH_R:
emit(REX_MOV_RR64, code, codePos);
emitByte(0xc0 + instr.dst, code, codePos);
emit(REX_MUL_R, code, codePos);
emitByte(0xe8 + instr.src, code, codePos);
emit(REX_MOV_R64R, code, codePos);
emitByte(0xc2 + 8 * instr.dst, code, codePos);
*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16);
codePos += 3;
*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19);
codePos += 3;
if (AVX2) {
static const uint8_t t[] = {
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC5, 0xB5, 0x73, 0xD0, 0x20,
0xC5, 0x7D, 0xF4, 0xD0,
0xC5, 0x3D, 0xF4, 0xD8,
0xC4, 0x41, 0x7D, 0xF4, 0xE1,
0xC4, 0x41, 0x3D, 0xF4, 0xE9,
0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
0xC4, 0x41, 0x25, 0xEF, 0xC6,
0xC4, 0x41, 0x25, 0xD4, 0xDC,
0xC4, 0x41, 0x25, 0xD4, 0xDA,
0xC4, 0x41, 0x25, 0xEF, 0xCE,
0xC4, 0x42, 0x3D, 0x37, 0xC1,
0xC4, 0x41, 0x3D, 0xDB, 0xC7,
0xC4, 0x41, 0x15, 0xD4, 0xE8,
0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
0xC4, 0x41, 0x15, 0xD4, 0xC3,
0xC4, 0x41, 0x35, 0xEF, 0xC9,
0xC4, 0x62, 0x35, 0x37, 0xD0,
0xC4, 0x62, 0x35, 0x37, 0xD8,
0xC5, 0x2D, 0xDB, 0xD0,
0xC5, 0x25, 0xDB, 0xD8,
0xC4, 0x41, 0x3D, 0xFB, 0xC2,
0xC4, 0xC1, 0x3D, 0xFB, 0xC3
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[3] += instr.dst;
p[8] += instr.src;
p[11] -= instr.dst * 8;
p[13] += instr.src;
p[17] += instr.src;
p[20] -= instr.dst * 8;
p[89] += instr.dst;
p[94] += instr.src;
p[98] += instr.src;
p[102] += instr.dst;
p[112] += instr.dst * 8;
}
break;
case randomx::SuperscalarInstructionType::IMUL_RCP:
emit(MOV_RAX_I, code, codePos);
*(uint32_t*)(code + codePos) = 0x0000B848UL;
codePos += 2;
emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
emit(REX_IMUL_RM, code, codePos);
emitByte(0xc0 + 8 * instr.dst, code, codePos);
emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos);
if (AVX2) {
static const uint8_t t[] = {
0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
0xC5, 0xBD, 0x73, 0xD0, 0x20,
0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
0xC4, 0x41, 0x7D, 0xF4, 0xD4,
0xC5, 0x35, 0xF4, 0xD8,
0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
0xC5, 0xFD, 0x73, 0xF0, 0x20,
0xC4, 0x41, 0x2D, 0xD4, 0xD3,
0xC5, 0xAD, 0xD4, 0xC0
};
uint8_t* p = code + codePos;
emit(t, code, codePos);
p[12] += instr.dst;
p[22] -= instr.dst * 8;
p[28] += instr.dst;
p[33] += instr.dst * 8;
p[41] -= instr.dst * 8;
p[43] += instr.dst;
p[53] += instr.dst * 9;
}
break;
default:
UNREACHABLE;
}
}
template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&);
template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&);
template<bool rax>
FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);
@ -563,10 +753,6 @@ namespace randomx {
codePos = pos;
}
void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) {
emitByte((scale << 6) | (index << 3) | base, code, codePos);
}
void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
uint8_t* const p = code;
uint32_t pos = codePos;