RandomX fixes

Intel JCC erratum fix and various other improvements, see more here: https://www.phoronix.com/scan.php?page=article&item=intel-jcc-microcode&num=1
2019-12-01 08:46:35 +01:00 · 2019-12-01 08:46:35 +01:00 · 84d7eb05f3
commit 84d7eb05f3
parent 8791261220
12 changed files with 320 additions and 40 deletions
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@ -29,6 +29,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <stdexcept>
 #include <cstring>
 #include <climits>
+#include <atomic>
 #include "crypto/randomx/jit_compiler_x86.hpp"
 #include "crypto/randomx/jit_compiler_x86_static.hpp"
 #include "crypto/randomx/superscalar.hpp"
@ -36,6 +37,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include "crypto/randomx/reciprocal.h"
 #include "crypto/randomx/virtual_memory.hpp"

+#ifdef _MSC_VER
+#   include <intrin.h>
+#else
+#   include <cpuid.h>
+#endif
+
 namespace randomx {
 	/*

@ -108,7 +115,7 @@ namespace randomx {
 	const int32_t codeSshPrefetchSize = codeShhEnd - codeShhPrefetch;
 	const int32_t codeSshInitSize = codeProgramEnd - codeShhInit;

-	const int32_t epilogueOffset = CodeSize - epilogueSize;
+	const int32_t epilogueOffset = (CodeSize - epilogueSize) & ~63;
 	constexpr int32_t superScalarHashOffset = 32768;

 	static const uint8_t REX_ADD_RR[] = { 0x4d, 0x03 };
@ -183,6 +190,7 @@ namespace randomx {
 	static const uint8_t REX_ADD_I[] = { 0x49, 0x81 };
 	static const uint8_t REX_TEST[] = { 0x49, 0xF7 };
 	static const uint8_t JZ[] = { 0x0f, 0x84 };
+	static const uint8_t JZ_SHORT = 0x74;
 	static const uint8_t RET = 0xc3;
 	static const uint8_t LEA_32[] = { 0x41, 0x8d };
 	static const uint8_t MOVNTI[] = { 0x4c, 0x0f, 0xc3 };
@ -197,20 +205,100 @@ namespace randomx {
 	static const uint8_t NOP7[] = { 0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00 };
 	static const uint8_t NOP8[] = { 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00 };

-//	static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
+	static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
+
+	static const uint8_t JMP_ALIGN_PREFIX[14][16] = {
+		{},
+		{0x2E},
+		{0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+		{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
+	};
+
+	bool JitCompilerX86::BranchesWithin32B = false;

 	size_t JitCompilerX86::getCodeSize() {
 		return codePos < prologueSize ? 0 : codePos - prologueSize;
 	}

+    static inline void cpuid(uint32_t level, int32_t output[4])
+    {
+        memset(output, 0, sizeof(int32_t) * 4);
+
+#   ifdef _MSC_VER
+        __cpuid(output, static_cast<int>(level));
+#   else
+        __cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
+#   endif
+    }
+
+    // CPU-specific tweaks
+	void JitCompilerX86::applyTweaks() {
+		int32_t info[4];
+		cpuid(0, info);
+
+		int32_t manufacturer[4];
+		manufacturer[0] = info[1];
+		manufacturer[1] = info[3];
+		manufacturer[2] = info[2];
+		manufacturer[3] = 0;
+
+		if (strcmp((const char*)manufacturer, "GenuineIntel") == 0) {
+			struct
+			{
+				unsigned int stepping : 4;
+				unsigned int model : 4;
+				unsigned int family : 4;
+				unsigned int processor_type : 2;
+				unsigned int reserved1 : 2;
+				unsigned int ext_model : 4;
+				unsigned int ext_family : 8;
+				unsigned int reserved2 : 4;
+			} processor_info;
+
+			cpuid(1, info);
+			memcpy(&processor_info, info, sizeof(processor_info));
+
+			// Intel JCC erratum mitigation
+			if (processor_info.family == 6) {
+				const uint32_t model = processor_info.model | (processor_info.ext_model << 4);
+				const uint32_t stepping = processor_info.stepping;
+
+				// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
+				BranchesWithin32B =
+					((model == 0x4E) && (stepping == 0x3)) ||
+					((model == 0x55) && (stepping == 0x4)) ||
+					((model == 0x5E) && (stepping == 0x3)) ||
+					((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
+					((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
+					((model == 0xA6) && (stepping == 0x0)) ||
+					((model == 0xAE) && (stepping == 0xA));
+			}
+		}
+	}
+
+	static std::atomic<size_t> codeOffset;
+
 	JitCompilerX86::JitCompilerX86() {
-		code = (uint8_t*)allocExecutableMemory(CodeSize);
+		applyTweaks();
+		allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2);
+		// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
+		code = allocatedCode + (codeOffset.fetch_add(59 * 64) % CodeSize);
 		memcpy(code, codePrologue, prologueSize);
 		memcpy(code + epilogueOffset, codeEpilogue, epilogueSize);
 	}

 	JitCompilerX86::~JitCompilerX86() {
-		freePagedMemory(code, CodeSize);
+		freePagedMemory(allocatedCode, CodeSize);
 	}

 	void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg) {
@ -307,6 +395,22 @@ namespace randomx {
 		emit(RandomX_CurrentConfig.codePrefetchScratchpadTweaked, prefetchScratchpadSize, code, codePos);
 		memcpy(code + codePos, codeLoopStore, loopStoreSize);
 		codePos += loopStoreSize;
+
+		if (BranchesWithin32B) {
+			const uint32_t branch_begin = static_cast<uint32_t>(codePos);
+			const uint32_t branch_end = static_cast<uint32_t>(branch_begin + 9);
+
+			// If the jump crosses or touches 32-byte boundary, align it
+			if ((branch_begin ^ branch_end) >= 32) {
+				uint32_t alignment_size = 32 - (branch_begin & 31);
+				if (alignment_size > 8) {
+					emit(NOPX[alignment_size - 9], alignment_size - 8, code, codePos);
+					alignment_size = 8;
+				}
+				emit(NOPX[alignment_size - 1], alignment_size, code, codePos);
+			}
+		}
+
 		emit(SUB_EBX, code, codePos);
 		emit(JNZ, code, codePos);
 		emit32(prologueSize - codePos - 4, code, codePos);
@ -408,12 +512,13 @@ namespace randomx {
 		}
 	}

-	void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos, bool rax) {
-		emit(LEA_32, code, codePos);
-		emitByte(0x80 + instr.src + (rax ? 0 : 8), code, codePos);
-		if (instr.src == RegisterNeedsSib) {
-			emitByte(0x24, code, codePos);
-		}
+	template<bool rax>
+	FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, uint8_t* code, int& codePos) {
+		const uint32_t src = *((uint32_t*)&instr) & 0xFF0000;
+
+		*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + src;
+		codePos += (src == (RegisterNeedsSib << 16)) ? 4 : 3;
+
 		emit32(instr.getImm32(), code, codePos);
 		if (rax)
 			emitByte(AND_EAX_I, code, codePos);
@ -422,12 +527,14 @@ namespace randomx {
 		emit32(instr.getModMem() ? ScratchpadL1Mask : ScratchpadL2Mask, code, codePos);
 	}

-	void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) {
-		emit(LEA_32, code, codePos);
-		emitByte(0x80 + instr.dst, code, codePos);
-		if (instr.dst == RegisterNeedsSib) {
-			emitByte(0x24, code, codePos);
-		}
+	template void JitCompilerX86::genAddressReg<false>(const Instruction& instr, uint8_t* code, int& codePos);
+	template void JitCompilerX86::genAddressReg<true>(const Instruction& instr, uint8_t* code, int& codePos);
+
+	FORCE_INLINE void JitCompilerX86::genAddressRegDst(const Instruction& instr, uint8_t* code, int& codePos) {
+		const uint32_t dst = static_cast<uint32_t>(instr.dst) << 16;
+		*(uint32_t*)(code + codePos) = 0x24808d41 + dst;
+		codePos += (dst == (RegisterNeedsSib << 16)) ? 4 : 3;
+
 		emit32(instr.getImm32(), code, codePos);
 		emitByte(AND_EAX_I, code, codePos);
 		if (instr.getModCond() < StoreL3Condition) {
@ -438,7 +545,7 @@ namespace randomx {
 		}
 	}

-	void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) {
+	FORCE_INLINE void JitCompilerX86::genAddressImm(const Instruction& instr, uint8_t* code, int& codePos) {
 		emit32(instr.getImm32() & ScratchpadL3Mask, code, codePos);
 	}

@ -483,7 +590,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos);
+			genAddressReg<true>(instr, p, pos);
 			emit32(template_IADD_M[instr.dst], p, pos);
 		}
 		else {
@ -523,7 +630,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos);
+			genAddressReg<true>(instr, p, pos);
 			emit(REX_SUB_RM, p, pos);
 			emitByte(0x04 + 8 * instr.dst, p, pos);
 			emitByte(0x06, p, pos);
@ -561,7 +668,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos);
+			genAddressReg<true>(instr, p, pos);
 			emit(REX_IMUL_RM, p, pos);
 			emitByte(0x04 + 8 * instr.dst, p, pos);
 			emitByte(0x06, p, pos);
@ -596,7 +703,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos, false);
+			genAddressReg<false>(instr, p, pos);
 			emit(REX_MOV_RR64, p, pos);
 			emitByte(0xc0 + instr.dst, p, pos);
 			emit(REX_MUL_MEM, p, pos);
@ -635,7 +742,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos, false);
+			genAddressReg<false>(instr, p, pos);
 			emit(REX_MOV_RR64, p, pos);
 			emitByte(0xc0 + instr.dst, p, pos);
 			emit(REX_IMUL_MEM, p, pos);
@ -704,7 +811,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		if (instr.src != instr.dst) {
-			genAddressReg(instr, p, pos);
+			genAddressReg<true>(instr, p, pos);
 			emit(REX_XOR_RM, p, pos);
 			emitByte(0x04 + 8 * instr.dst, p, pos);
 			emitByte(0x06, p, pos);
@ -801,7 +908,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		const uint32_t dst = instr.dst % RegisterCountFlt;
-		genAddressReg(instr, p, pos);
+		genAddressReg<true>(instr, p, pos);
 		emit(REX_CVTDQ2PD_XMM12, p, pos);
 		emit(REX_ADDPD, p, pos);
 		emitByte(0xc4 + 8 * dst, p, pos);
@ -826,7 +933,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		const uint32_t dst = instr.dst % RegisterCountFlt;
-		genAddressReg(instr, p, pos);
+		genAddressReg<true>(instr, p, pos);
 		emit(REX_CVTDQ2PD_XMM12, p, pos);
 		emit(REX_SUBPD, p, pos);
 		emitByte(0xc4 + 8 * dst, p, pos);
@ -862,7 +969,7 @@ namespace randomx {
 		int pos = codePos;
 		
 		const uint32_t dst = instr.dst % RegisterCountFlt;
-		genAddressReg(instr, p, pos);
+		genAddressReg<true>(instr, p, pos);
 		emit(REX_CVTDQ2PD_XMM12, p, pos);
 		emit(REX_ANDPS_XMM12, p, pos);
 		emit(REX_DIVPD, p, pos);
@ -902,19 +1009,39 @@ namespace randomx {
 		uint8_t* const p = code;
 		int pos = codePos;
 		
-		int reg = instr.dst;
+		const int reg = instr.dst;
+		int32_t jmp_offset = registerUsage[reg] - (pos + 16);
+
+		if (BranchesWithin32B) {
+			const uint32_t branch_begin = static_cast<uint32_t>(pos + 7);
+			const uint32_t branch_end = static_cast<uint32_t>(branch_begin + ((jmp_offset >= -128) ? 9 : 13));
+
+			// If the jump crosses or touches 32-byte boundary, align it
+			if ((branch_begin ^ branch_end) >= 32) {
+				const uint32_t alignment_size = 32 - (branch_begin & 31);
+				jmp_offset -= alignment_size;
+				emit(JMP_ALIGN_PREFIX[alignment_size], alignment_size, p, pos);
+			}
+		}
+
 		emit(REX_ADD_I, p, pos);
 		emitByte(0xc0 + reg, p, pos);
-		int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset;
-		uint32_t imm = instr.getImm32() | (1UL << shift);
-		if (RandomX_CurrentConfig.JumpOffset > 0 || shift > 0)
-			imm &= ~(1UL << (shift - 1));
+		const int shift = instr.getModCond() + RandomX_CurrentConfig.JumpOffset;
+		const uint32_t imm = (instr.getImm32() | (1UL << shift)) & ~(1UL << (shift - 1));
 		emit32(imm, p, pos);
 		emit(REX_TEST, p, pos);
 		emitByte(0xc0 + reg, p, pos);
 		emit32(RandomX_CurrentConfig.ConditionMask_Calculated << shift, p, pos);
-		emit(JZ, p, pos);
-		emit32(registerUsage[reg] - (pos + 4), p, pos);
+
+		if (jmp_offset >= -128) {
+			emitByte(JZ_SHORT, p, pos);
+			emitByte(jmp_offset, p, pos);
+		}
+		else {
+			emit(JZ, p, pos);
+			emit32(jmp_offset - 4, p, pos);
+		}
+
 		//mark all registers as used
 		uint64_t* r = (uint64_t*) registerUsage;
 		uint64_t k = pos;