From 21a56c9cbfb9f6e4cfaeb20ea111e8d00303a068 Mon Sep 17 00:00:00 2001
From: SChernykh <sergey.v.chernykh@gmail.com>
Date: Tue, 27 Aug 2019 16:12:13 +0200
Subject: [PATCH] Updated RandomX

---
 src/crypto/randomx/argon2_core.c        |  13 ---
 src/crypto/randomx/blake2_generator.cpp |   2 +-
 src/crypto/randomx/blake2_generator.hpp |   2 +-
 src/crypto/randomx/bytecode_machine.cpp |   2 +-
 src/crypto/randomx/common.hpp           |   2 +-
 src/crypto/randomx/intrin_portable.h    | 139 ++++++++++++++++++++++--
 src/crypto/randomx/jit_compiler_x86.cpp |   4 +-
 src/crypto/randomx/randomx.cpp          |   9 +-
 src/crypto/randomx/superscalar.cpp      |  14 +--
 9 files changed, 151 insertions(+), 36 deletions(-)

diff --git a/src/crypto/randomx/argon2_core.c b/src/crypto/randomx/argon2_core.c
index f9babbc4..97176608 100644
--- a/src/crypto/randomx/argon2_core.c
+++ b/src/crypto/randomx/argon2_core.c
@@ -263,19 +263,6 @@ int rxa2_validate_inputs(const argon2_context *context) {
 		return ARGON2_INCORRECT_PARAMETER;
 	}
 
-	if (NULL == context->out) {
-		return ARGON2_OUTPUT_PTR_NULL;
-	}
-
-	/* Validate output length */
-	if (ARGON2_MIN_OUTLEN > context->outlen) {
-		return ARGON2_OUTPUT_TOO_SHORT;
-	}
-
-	if (ARGON2_MAX_OUTLEN < context->outlen) {
-		return ARGON2_OUTPUT_TOO_LONG;
-	}
-
 	/* Validate password (required param) */
 	if (NULL == context->pwd) {
 		if (0 != context->pwdlen) {
diff --git a/src/crypto/randomx/blake2_generator.cpp b/src/crypto/randomx/blake2_generator.cpp
index 83789129..edfe2e34 100644
--- a/src/crypto/randomx/blake2_generator.cpp
+++ b/src/crypto/randomx/blake2_generator.cpp
@@ -46,7 +46,7 @@ namespace randomx {
 		return data[dataIndex++];
 	}
 
-	uint32_t Blake2Generator::getInt32() {
+	uint32_t Blake2Generator::getUInt32() {
 		checkData(4);
 		auto ret = load32(&data[dataIndex]);
 		dataIndex += 4;
diff --git a/src/crypto/randomx/blake2_generator.hpp b/src/crypto/randomx/blake2_generator.hpp
index b5ac0801..5e7f61f2 100644
--- a/src/crypto/randomx/blake2_generator.hpp
+++ b/src/crypto/randomx/blake2_generator.hpp
@@ -36,7 +36,7 @@ namespace randomx {
 	public:
 		Blake2Generator(const void* seed, size_t seedSize, int nonce = 0);
 		uint8_t getByte();
-		uint32_t getInt32();
+		uint32_t getUInt32();
 	private:
 		void checkData(const size_t);
 
diff --git a/src/crypto/randomx/bytecode_machine.cpp b/src/crypto/randomx/bytecode_machine.cpp
index 6c51b86c..55a63935 100644
--- a/src/crypto/randomx/bytecode_machine.cpp
+++ b/src/crypto/randomx/bytecode_machine.cpp
@@ -244,7 +244,7 @@ namespace randomx {
 
 		if (opcode < RandomX_CurrentConfig.CEIL_IMUL_RCP) {
 			uint64_t divisor = instr.getImm32();
-			if (!isPowerOf2(divisor)) {
+			if (!isZeroOrPowerOf2(divisor)) {
 				auto dst = instr.dst % RegistersCount;
 				ibc.type = InstructionType::IMUL_R;
 				ibc.idst = &nreg->r[dst];
diff --git a/src/crypto/randomx/common.hpp b/src/crypto/randomx/common.hpp
index 31b18ce4..da36f2c5 100644
--- a/src/crypto/randomx/common.hpp
+++ b/src/crypto/randomx/common.hpp
@@ -137,7 +137,7 @@ namespace randomx {
 	constexpr int RegisterNeedsDisplacement = 5; //x86 r13 register
 	constexpr int RegisterNeedsSib = 4; //x86 r12 register
 
-	inline bool isPowerOf2(uint64_t x) {
+	inline bool isZeroOrPowerOf2(uint64_t x) {
 		return (x & (x - 1)) == 0;
 	}
 
diff --git a/src/crypto/randomx/intrin_portable.h b/src/crypto/randomx/intrin_portable.h
index e4916096..338d6d89 100644
--- a/src/crypto/randomx/intrin_portable.h
+++ b/src/crypto/randomx/intrin_portable.h
@@ -376,11 +376,131 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 
 #define RANDOMX_DEFAULT_FENV
 
-void rx_reset_float_state();
+#elif defined(__aarch64__)
 
-void rx_set_rounding_mode(uint32_t mode);
+#include <stdlib.h>
+#include <arm_neon.h>
+#include <arm_acle.h>
 
-#else //end altivec
+typedef uint8x16_t rx_vec_i128;
+typedef float64x2_t rx_vec_f128;
+
+#define rx_aligned_alloc(size, align) aligned_alloc(align, size)
+#define rx_aligned_free(a) free(a)
+
+inline void rx_prefetch_nta(void* ptr) {
+	asm volatile ("prfm pldl1strm, [%0]\n" : : "r" (ptr));
+}
+
+FORCE_INLINE rx_vec_f128 rx_load_vec_f128(const double* pd) {
+	return vld1q_f64((const float64_t*)pd);
+}
+
+FORCE_INLINE void rx_store_vec_f128(double* mem_addr, rx_vec_f128 val) {
+	vst1q_f64((float64_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_swap_vec_f128(rx_vec_f128 a) {
+	float64x2_t temp;
+	temp = vcopyq_laneq_f64(temp, 1, a, 1);
+	a = vcopyq_laneq_f64(a, 1, a, 0);
+	return vcopyq_laneq_f64(a, 0, temp, 1);
+}
+
+FORCE_INLINE rx_vec_f128 rx_set_vec_f128(uint64_t x1, uint64_t x0) {
+	uint64x2_t temp0 = vdupq_n_u64(x0);
+	uint64x2_t temp1 = vdupq_n_u64(x1);
+	return vreinterpretq_f64_u64(vcopyq_laneq_u64(temp0, 1, temp1, 0));
+}
+
+FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
+	return vreinterpretq_f64_u64(vdupq_n_u64(x));
+}
+
+#define rx_add_vec_f128 vaddq_f64
+#define rx_sub_vec_f128 vsubq_f64
+#define rx_mul_vec_f128 vmulq_f64
+#define rx_div_vec_f128 vdivq_f64
+#define rx_sqrt_vec_f128 vsqrtq_f64
+
+FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(veorq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_and_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vandq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+FORCE_INLINE rx_vec_f128 rx_or_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
+	return vreinterpretq_f64_u8(vorrq_u8(vreinterpretq_u8_f64(a), vreinterpretq_u8_f64(b)));
+}
+
+#ifdef __ARM_FEATURE_CRYPTO
+
+
+FORCE_INLINE rx_vec_i128 rx_aesenc_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesmcq_u8(vaeseq_u8(a, zero)) ^ key;
+}
+
+FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 a, rx_vec_i128 key) {
+	const uint8x16_t zero = { 0 };
+	return vaesimcq_u8(vaesdq_u8(a, zero)) ^ key;
+}
+
+#define HAVE_AES
+
+#endif
+
+#define rx_xor_vec_i128 veorq_u8
+
+FORCE_INLINE int rx_vec_i128_x(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 0);
+}
+
+FORCE_INLINE int rx_vec_i128_y(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 1);
+}
+
+FORCE_INLINE int rx_vec_i128_z(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 2);
+}
+
+FORCE_INLINE int rx_vec_i128_w(rx_vec_i128 a) {
+	return vgetq_lane_s32(vreinterpretq_s32_u8(a), 3);
+}
+
+FORCE_INLINE rx_vec_i128 rx_set_int_vec_i128(int _I3, int _I2, int _I1, int _I0) {
+	int32_t data[4];
+	data[0] = _I0;
+	data[1] = _I1;
+	data[2] = _I2;
+	data[3] = _I3;
+	return vreinterpretq_u8_s32(vld1q_s32(data));
+};
+
+#define rx_xor_vec_i128 veorq_u8
+
+FORCE_INLINE rx_vec_i128 rx_load_vec_i128(const rx_vec_i128* mem_addr) {
+	return vld1q_u8((const uint8_t*)mem_addr);
+}
+
+FORCE_INLINE void rx_store_vec_i128(rx_vec_i128* mem_addr, rx_vec_i128 val) {
+	vst1q_u8((uint8_t*)mem_addr, val);
+}
+
+FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
+	double lo = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 0));
+	double hi = unsigned32ToSigned2sCompl(load32((uint8_t*)addr + 4));
+	rx_vec_f128 x;
+	x = vsetq_lane_f64(lo, x, 0);
+	x = vsetq_lane_f64(hi, x, 1);
+	return x;
+}
+
+#define RANDOMX_DEFAULT_FENV
+
+#else //portable fallback
 
 #include <cstdint>
 #include <stdexcept>
@@ -487,7 +607,6 @@ FORCE_INLINE rx_vec_f128 rx_set1_vec_f128(uint64_t x) {
 	return v;
 }
 
-
 FORCE_INLINE rx_vec_f128 rx_xor_vec_f128(rx_vec_f128 a, rx_vec_f128 b) {
 	rx_vec_f128 x;
 	x.i.u64[0] = a.i.u64[0] ^ b.i.u64[0];
@@ -578,10 +697,6 @@ FORCE_INLINE rx_vec_f128 rx_cvt_packed_int_vec_f128(const void* addr) {
 
 #define RANDOMX_DEFAULT_FENV
 
-void rx_reset_float_state();
-
-void rx_set_rounding_mode(uint32_t mode);
-
 #endif
 
 #ifndef HAVE_AES
@@ -598,6 +713,14 @@ FORCE_INLINE rx_vec_i128 rx_aesdec_vec_i128(rx_vec_i128 v, rx_vec_i128 rkey) {
 }
 #endif
 
+#ifdef RANDOMX_DEFAULT_FENV
+
+void rx_reset_float_state();
+
+void rx_set_rounding_mode(uint32_t mode);
+
+#endif
+
 double loadDoublePortable(const void* addr);
 uint64_t mulh(uint64_t, uint64_t);
 int64_t smulh(int64_t, int64_t);
diff --git a/src/crypto/randomx/jit_compiler_x86.cpp b/src/crypto/randomx/jit_compiler_x86.cpp
index dd579c69..043874a8 100644
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@@ -197,7 +197,7 @@ namespace randomx {
 //	static const uint8_t* NOPX[] = { NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8 };
 
 	size_t JitCompilerX86::getCodeSize() {
-		return codePos - prologueSize;
+		return codePos < prologueSize ? 0 : codePos - prologueSize;
 	}
 
 	JitCompilerX86::JitCompilerX86() {
@@ -580,7 +580,7 @@ namespace randomx {
 
 	void JitCompilerX86::h_IMUL_RCP(Instruction& instr, int i) {
 		uint64_t divisor = instr.getImm32();
-		if (!isPowerOf2(divisor)) {
+		if (!isZeroOrPowerOf2(divisor)) {
 			registerUsage[instr.dst] = i;
 			emit(MOV_RAX_I);
 			emit64(randomx_reciprocal_fast(divisor));
diff --git a/src/crypto/randomx/randomx.cpp b/src/crypto/randomx/randomx.cpp
index ffd9e2c5..a5f6bc08 100644
--- a/src/crypto/randomx/randomx.cpp
+++ b/src/crypto/randomx/randomx.cpp
@@ -44,12 +44,14 @@ RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
 	ScratchpadL2_Size = 131072;
 	ScratchpadL3_Size = 1048576;
 
+	RANDOMX_FREQ_IADD_RS = 25;
 	RANDOMX_FREQ_IROR_R = 10;
 	RANDOMX_FREQ_IROL_R = 0;
 	RANDOMX_FREQ_FSWAP_R = 8;
 	RANDOMX_FREQ_FADD_R = 20;
 	RANDOMX_FREQ_FSUB_R = 20;
 	RANDOMX_FREQ_FMUL_R = 20;
+	RANDOMX_FREQ_CBRANCH = 16;
 
 	fillAes4Rx4_Key[0] = rx_set_int_vec_i128(0xcf359e95, 0x141f82b7, 0x7ffbe4a6, 0xf890465d);
 	fillAes4Rx4_Key[1] = rx_set_int_vec_i128(0x6741ffdc, 0xbd5c5ac3, 0xfee8278a, 0x6a55c450);
@@ -68,6 +70,9 @@ RandomX_ConfigurationLoki::RandomX_ConfigurationLoki()
 	ArgonSalt = "RandomXL\x12";
 	ProgramSize = 320;
 	ProgramCount = 7;
+
+	RANDOMX_FREQ_IADD_RS = 25;
+	RANDOMX_FREQ_CBRANCH = 16;
 }
 
 RandomX_ConfigurationBase::RandomX_ConfigurationBase()
@@ -87,7 +92,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
 	, ProgramCount(8)
 	, JumpBits(8)
 	, JumpOffset(8)
-	, RANDOMX_FREQ_IADD_RS(25)
+	, RANDOMX_FREQ_IADD_RS(16)
 	, RANDOMX_FREQ_IADD_M(7)
 	, RANDOMX_FREQ_ISUB_R(16)
 	, RANDOMX_FREQ_ISUB_M(7)
@@ -113,7 +118,7 @@ RandomX_ConfigurationBase::RandomX_ConfigurationBase()
 	, RANDOMX_FREQ_FMUL_R(32)
 	, RANDOMX_FREQ_FDIV_M(4)
 	, RANDOMX_FREQ_FSQRT_R(6)
-	, RANDOMX_FREQ_CBRANCH(16)
+	, RANDOMX_FREQ_CBRANCH(25)
 	, RANDOMX_FREQ_CFROUND(1)
 	, RANDOMX_FREQ_ISTORE(16)
 	, RANDOMX_FREQ_NOP(0)
diff --git a/src/crypto/randomx/superscalar.cpp b/src/crypto/randomx/superscalar.cpp
index 4ed993f4..aaa91f62 100644
--- a/src/crypto/randomx/superscalar.cpp
+++ b/src/crypto/randomx/superscalar.cpp
@@ -329,7 +329,7 @@ namespace randomx {
 			return false;
 
 		if (availableRegisters.size() > 1) {
-			index = gen.getInt32() % availableRegisters.size();
+			index = gen.getUInt32() % availableRegisters.size();
 		}
 		else {
 			index = 0;
@@ -442,7 +442,7 @@ namespace randomx {
 			case SuperscalarInstructionType::IADD_C8:
 			case SuperscalarInstructionType::IADD_C9: {
 				mod_ = 0;
-				imm32_ = gen.getInt32();
+				imm32_ = gen.getUInt32();
 				opGroup_ = SuperscalarInstructionType::IADD_C7;
 				opGroupPar_ = -1;
 			} break;
@@ -451,7 +451,7 @@ namespace randomx {
 			case SuperscalarInstructionType::IXOR_C8:
 			case SuperscalarInstructionType::IXOR_C9: {
 				mod_ = 0;
-				imm32_ = gen.getInt32();
+				imm32_ = gen.getUInt32();
 				opGroup_ = SuperscalarInstructionType::IXOR_C7;
 				opGroupPar_ = -1;
 			} break;
@@ -461,7 +461,7 @@ namespace randomx {
 				mod_ = 0;
 				imm32_ = 0;
 				opGroup_ = SuperscalarInstructionType::IMULH_R;
-				opGroupPar_ = gen.getInt32();
+				opGroupPar_ = gen.getUInt32();
 			} break;
 
 			case SuperscalarInstructionType::ISMULH_R: {
@@ -469,14 +469,14 @@ namespace randomx {
 				mod_ = 0;
 				imm32_ = 0;
 				opGroup_ = SuperscalarInstructionType::ISMULH_R;
-				opGroupPar_ = gen.getInt32();
+				opGroupPar_ = gen.getUInt32();
 			} break;
 
 			case SuperscalarInstructionType::IMUL_RCP: {
 				mod_ = 0;
 				do {
-					imm32_ = gen.getInt32();
-				} while ((imm32_ & (imm32_ - 1)) == 0);
+					imm32_ = gen.getUInt32();
+				} while (isZeroOrPowerOf2(imm32_));
 				opGroup_ = SuperscalarInstructionType::IMUL_RCP;
 				opGroupPar_ = -1;
 			} break;