Merge xmrig v6.7.0 into master

2020-12-23 06:03:02 +00:00 · 2020-12-23 06:03:02 +00:00 · 1719879f7e
commit 1719879f7e
parent 966aaa72ca 9d256a1e9b
249 changed files with 6814 additions and 6134 deletions
--- a/src/crypto/randomx/aes_hash.cpp
+++ b/src/crypto/randomx/aes_hash.cpp
@ -28,12 +28,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include <thread>
 #include <vector>
+#include <array>

 #include "crypto/randomx/aes_hash.hpp"
-#include "crypto/randomx/soft_aes.h"
-#include "crypto/randomx/randomx.h"
 #include "base/tools/Chrono.h"
-#include "base/tools/Profiler.h"
+#include "crypto/randomx/randomx.h"
+#include "crypto/randomx/soft_aes.h"
+#include "crypto/rx/Profiler.h"

 #define AES_HASH_1R_STATE0 0xd7983aad, 0xcc82db47, 0x9fa856de, 0x92b52c0d
 #define AES_HASH_1R_STATE1 0xace78057, 0xf59e125a, 0x15c7b798, 0x338d996e
@ -371,7 +372,7 @@ hashAndFillAes1Rx4_impl* softAESImpl = &hashAndFillAes1Rx4<1,1>;
 void SelectSoftAESImpl(size_t threadsCount)
 {
  constexpr int test_length_ms = 100;
-  const std::vector<hashAndFillAes1Rx4_impl *> impl = {
+  const std::array<hashAndFillAes1Rx4_impl *, 4> impl = {
    &hashAndFillAes1Rx4<1,1>,
    &hashAndFillAes1Rx4<2,1>,
    &hashAndFillAes1Rx4<2,2>,
--- a/src/crypto/randomx/asm/program_sshash_avx2_constants.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_constants.inc
@ -0,0 +1,28 @@
+r0_avx2_increments:	;# data constants for the AVX2 dataset-init code; this label: four 64-bit little-endian values 2, 3, 4, 5
+	db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
+mul_hi_avx2_data:	;# 32-bit words 0 and 1; NOTE(review): presumably an operand of the AVX2 high-multiply code - confirm against the JIT
+	db 0,0,0,0,1,0,0,0
+r0_avx2_mul:
+	;#/ 6364136223846793005 (same value as superscalarMul0 in dataset.cpp); the db line is its little-endian byte encoding
+	db 45, 127, 149, 76, 45, 244, 81, 88
+r1_avx2_add:
+	;#/ 9298411001130361340 (the r1..r7 db lines likewise encode the commented value, low byte first)
+	db 252, 161, 245, 89, 138, 151, 10, 129
+r2_avx2_add:
+	;#/ 12065312585734608966
+	db 70, 216, 194, 56, 223, 153, 112, 167
+r3_avx2_add:
+	;#/ 9306329213124626780
+	db 92, 73, 34, 191, 28, 185, 38, 129
+r4_avx2_add:
+	;#/ 5281919268842080866
+	db 98, 138, 159, 23, 151, 37, 77, 73
+r5_avx2_add:
+	;#/ 10536153434571861004
+	db 12, 236, 170, 206, 185, 239, 55, 146
+r6_avx2_add:
+	;#/ 3398623926847679864
+	db 120, 45, 230, 108, 116, 86, 42, 47
+r7_avx2_add:
+	;#/ 9549104520008361294
+	db 78, 229, 44, 182, 247, 59, 133, 132
--- a/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_epilogue.inc
@ -0,0 +1,31 @@
+	add rsp, 40	;# epilogue of the AVX2 dataset init: release the 40 bytes of loop scratch space ([rsp]..[rsp+24] held cache-line pointers)
+	pop r9	;# pops the qword that loop_end compared rbp against at [rsp+40]; NOTE(review): presumably pushed by the prologue - confirm
+
+	movdqu xmm0,  xmmword ptr [rsp]	;# restore xmm0-xmm15 from the 256-byte area filled by save_registers
+	movdqu xmm1,  xmmword ptr [rsp + 16]
+	movdqu xmm2,  xmmword ptr [rsp + 32]
+	movdqu xmm3,  xmmword ptr [rsp + 48]
+	movdqu xmm4,  xmmword ptr [rsp + 64]
+	movdqu xmm5,  xmmword ptr [rsp + 80]
+	movdqu xmm6,  xmmword ptr [rsp + 96]
+	movdqu xmm7,  xmmword ptr [rsp + 112]
+	movdqu xmm8,  xmmword ptr [rsp + 128]
+	movdqu xmm9,  xmmword ptr [rsp + 144]
+	movdqu xmm10, xmmword ptr [rsp + 160]
+	movdqu xmm11, xmmword ptr [rsp + 176]
+	movdqu xmm12, xmmword ptr [rsp + 192]
+	movdqu xmm13, xmmword ptr [rsp + 208]
+	movdqu xmm14, xmmword ptr [rsp + 224]
+	movdqu xmm15, xmmword ptr [rsp + 240]
+	vzeroupper	;# zero the upper halves of all ymm registers before returning to the caller
+	add rsp, 256	;# release the xmm save area (16 registers * 16 bytes)
+
+	pop r15	;# general-purpose registers are popped in the reverse of the push order in save_registers
+	pop r14
+	pop r13
+	pop r12
+	pop rsi
+	pop rdi
+	pop rbp
+	pop rbx
+	ret
--- a/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_begin.inc
@ -0,0 +1,37 @@
+	;# prefetch RandomX dataset lines (5 lines of 64 bytes = the 320 bytes written per iteration by loop_end)
+	prefetchnta byte ptr [rsi]
+	prefetchnta byte ptr [rsi+64]
+	prefetchnta byte ptr [rsi+128]
+	prefetchnta byte ptr [rsi+192]
+	prefetchnta byte ptr [rsi+256]
+
+	;# prefetch RandomX cache lines for items rbp .. rbp+4; each address is ((index & RANDOMX_CACHE_MASK) << 6) + rdi
+	mov rbx, rbp	;# item rbp: its cache-line address stays in rbx
+	and rbx, RANDOMX_CACHE_MASK
+	shl rbx, 6
+	add rbx, rdi
+	prefetchnta byte ptr [rbx]
+	lea rax, [rbp+1]	;# item rbp+1
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	prefetchnta byte ptr [rax]
+	mov [rsp], rax	;# the four pointers for items rbp+1..rbp+4 go to [rsp]..[rsp+24], where ssh_load reads them back
+	lea rax, [rbp+2]	;# item rbp+2
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	prefetchnta byte ptr [rax]
+	mov [rsp+8], rax
+	lea rax, [rbp+3]	;# item rbp+3
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	prefetchnta byte ptr [rax]
+	mov [rsp+16], rax
+	lea rax, [rbp+4]	;# item rbp+4
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	prefetchnta byte ptr [rax]
+	mov [rsp+24], rax
--- a/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_loop_end.inc
@ -0,0 +1,38 @@
+	mov qword ptr [rsi+0], r8	;# end of one loop iteration: r8-r15 are stored as the first 64-byte item at [rsi]
+	vpunpcklqdq ymm8, ymm0, ymm1	;# interleaved with the scalar stores: start transposing ymm0-ymm7 (one register per ymm, four items per ymm) into per-item rows
+	mov qword ptr [rsi+8], r9
+	vpunpcklqdq ymm9, ymm2, ymm3
+	mov qword ptr [rsi+16], r10
+	vpunpcklqdq ymm10, ymm4, ymm5
+	mov qword ptr [rsi+24], r11
+	vpunpcklqdq ymm11, ymm6, ymm7
+	mov qword ptr [rsi+32], r12
+	vpunpckhqdq ymm12, ymm0, ymm1
+	mov qword ptr [rsi+40], r13
+	vpunpckhqdq ymm13, ymm2, ymm3
+	mov qword ptr [rsi+48], r14
+	vpunpckhqdq ymm14, ymm4, ymm5
+	mov qword ptr [rsi+56], r15
+	vpunpckhqdq ymm15, ymm6, ymm7
+
+	vperm2i128 ymm0, ymm8, ymm9, 32	;# imm 32 selects the low 128-bit halves of both sources: lane 0 of ymm0..ymm3 (original contents)
+	vperm2i128 ymm1, ymm10, ymm11, 32	;# lane 0 of ymm4..ymm7
+	vmovdqu ymmword ptr [rsi+64], ymm0	;# second 64-byte item
+	vmovdqu ymmword ptr [rsi+96], ymm1
+	vperm2i128 ymm2, ymm12, ymm13, 32	;# lane 1 of ymm0..ymm3
+	vperm2i128 ymm3, ymm14, ymm15, 32	;# lane 1 of ymm4..ymm7
+	vmovdqu ymmword ptr [rsi+128], ymm2	;# third item
+	vmovdqu ymmword ptr [rsi+160], ymm3
+	vperm2i128 ymm4, ymm8, ymm9, 49	;# imm 49 selects the high 128-bit halves: lane 2 of ymm0..ymm3
+	vperm2i128 ymm5, ymm10, ymm11, 49	;# lane 2 of ymm4..ymm7
+	vmovdqu ymmword ptr [rsi+192], ymm4	;# fourth item
+	vmovdqu ymmword ptr [rsi+224], ymm5
+	vperm2i128 ymm6, ymm12, ymm13, 49	;# lane 3 of ymm0..ymm3
+	vperm2i128 ymm7, ymm14, ymm15, 49	;# lane 3 of ymm4..ymm7
+	vmovdqu ymmword ptr [rsi+256], ymm6	;# fifth item
+	vmovdqu ymmword ptr [rsi+288], ymm7
+
+	add rbp, 5	;# five items are produced per iteration
+	add rsi, 320	;# 5 items * 64 bytes
+	cmp rbp, qword ptr [rsp+40]	;# loop bound kept on the stack just above the 40-byte scratch area
+	db 15, 130, 0, 0, 0, 0		;# jb rel32 - the zero displacement is a placeholder overwritten by JitCompilerX86::generateSuperscalarHash
--- a/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_save_registers.inc
@ -0,0 +1,27 @@
+	push rbx	;# save general-purpose registers; the epilogue pops them in the reverse order
+	push rbp
+	push rdi
+	push rsi
+	push r12
+	push r13
+	push r14
+	push r15
+
+	;# save all XMM registers just to be safe for all calling conventions
+	sub rsp, 256	;# 16 registers * 16 bytes; the epilogue reloads from the same offsets
+	movdqu xmmword ptr [rsp], xmm0
+	movdqu xmmword ptr [rsp + 16], xmm1
+	movdqu xmmword ptr [rsp + 32], xmm2
+	movdqu xmmword ptr [rsp + 48], xmm3
+	movdqu xmmword ptr [rsp + 64], xmm4
+	movdqu xmmword ptr [rsp + 80], xmm5
+	movdqu xmmword ptr [rsp + 96], xmm6
+	movdqu xmmword ptr [rsp + 112], xmm7
+	movdqu xmmword ptr [rsp + 128], xmm8
+	movdqu xmmword ptr [rsp + 144], xmm9
+	movdqu xmmword ptr [rsp + 160], xmm10
+	movdqu xmmword ptr [rsp + 176], xmm11
+	movdqu xmmword ptr [rsp + 192], xmm12
+	movdqu xmmword ptr [rsp + 208], xmm13
+	movdqu xmmword ptr [rsp + 224], xmm14
+	movdqu xmmword ptr [rsp + 240], xmm15
--- a/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_load.inc
@ -0,0 +1,50 @@
+	sub rsp, 40	;# XOR four 64-byte cache lines into ymm0-ymm7; 40 bytes of temporary space = 8 (rbx) + 32 (ymm14)
+	mov [rsp], rbx	;# rbx and ymm14 are used as temporaries below and restored at the end
+	vmovdqu ymmword ptr [rsp+8], ymm14
+
+	mov rax, [rsp+40]	;# the four cache-line pointers stored at [rsp]..[rsp+24] before the sub above (by loop_begin / ssh_prefetch)
+	mov rbx, [rsp+48]
+	mov rcx, [rsp+56]
+	mov rdx, [rsp+64]	;# rax, rcx and rdx are overwritten and not restored
+
+	vmovdqu ymm8, ymmword ptr [rax]			;# ymm8  = r0[1], r1[1], r2[1], r3[1]   (rN[k]: qword N of the line for lane k)
+	vmovdqu ymm9, ymmword ptr [rbx]			;# ymm9  = r0[2], r1[2], r2[2], r3[2]
+	vmovdqu ymm10, ymmword ptr [rcx]		;# ymm10 = r0[3], r1[3], r2[3], r3[3]
+	vmovdqu ymm11, ymmword ptr [rdx]		;# ymm11 = r0[4], r1[4], r2[4], r3[4]
+
+	vpunpcklqdq ymm12, ymm8, ymm9			;# ymm12 = r0[1], r0[2], r2[1], r2[2]
+	vpunpcklqdq ymm13, ymm10, ymm11			;# ymm13 = r0[3], r0[4], r2[3], r2[4]
+	vperm2i128 ymm14, ymm12, ymm13, 32		;# ymm14 = r0[1], r0[2], r0[3], r0[4]
+	vpxor ymm0, ymm0, ymm14
+	vperm2i128 ymm14, ymm12, ymm13, 49		;# ymm14 = r2[1], r2[2], r2[3], r2[4]
+	vpxor ymm2, ymm2, ymm14
+
+	vpunpckhqdq ymm12, ymm8, ymm9			;# ymm12 = r1[1], r1[2], r3[1], r3[2]
+	vpunpckhqdq ymm13, ymm10, ymm11			;# ymm13 = r1[3], r1[4], r3[3], r3[4]
+	vperm2i128 ymm14, ymm12, ymm13, 32		;# ymm14 = r1[1], r1[2], r1[3], r1[4]
+	vpxor ymm1, ymm1, ymm14
+	vperm2i128 ymm14, ymm12, ymm13, 49		;# ymm14 = r3[1], r3[2], r3[3], r3[4]
+	vpxor ymm3, ymm3, ymm14
+
+	vmovdqu ymm8, ymmword ptr [rax+32]		;# ymm8  = r4[1], r5[1], r6[1], r7[1]   (second half of each 64-byte line)
+	vmovdqu ymm9, ymmword ptr [rbx+32]		;# ymm9  = r4[2], r5[2], r6[2], r7[2]
+	vmovdqu ymm10, ymmword ptr [rcx+32]		;# ymm10 = r4[3], r5[3], r6[3], r7[3]
+	vmovdqu ymm11, ymmword ptr [rdx+32]		;# ymm11 = r4[4], r5[4], r6[4], r7[4]
+
+	vpunpcklqdq ymm12, ymm8, ymm9			;# ymm12 = r4[1], r4[2], r6[1], r6[2]
+	vpunpcklqdq ymm13, ymm10, ymm11			;# ymm13 = r4[3], r4[4], r6[3], r6[4]
+	vperm2i128 ymm14, ymm12, ymm13, 32		;# ymm14 = r4[1], r4[2], r4[3], r4[4]
+	vpxor ymm4, ymm4, ymm14
+	vperm2i128 ymm14, ymm12, ymm13, 49		;# ymm14 = r6[1], r6[2], r6[3], r6[4]
+	vpxor ymm6, ymm6, ymm14
+
+	vpunpckhqdq ymm12, ymm8, ymm9			;# ymm12 = r5[1], r5[2], r7[1], r7[2]
+	vpunpckhqdq ymm13, ymm10, ymm11			;# ymm13 = r5[3], r5[4], r7[3], r7[4]
+	vperm2i128 ymm14, ymm12, ymm13, 32		;# ymm14 = r5[1], r5[2], r5[3], r5[4]
+	vpxor ymm5, ymm5, ymm14
+	vperm2i128 ymm14, ymm12, ymm13, 49		;# ymm14 = r7[1], r7[2], r7[3], r7[4]
+	vpxor ymm7, ymm7, ymm14
+
+	mov rbx, [rsp]	;# restore the two saved temporaries and release the 40 bytes
+	vmovdqu ymm14, ymmword ptr [rsp+8]
+	add rsp, 40
--- a/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
+++ b/src/crypto/randomx/asm/program_sshash_avx2_ssh_prefetch.inc
@ -0,0 +1,29 @@
+	vmovdqu ymmword ptr [rsp], ymm0	;# spill four 64-bit lanes to [rsp]..[rsp+24]; the JIT adds (address register << 3) to byte 3 of this instruction (presumably its ModRM byte) to pick the source ymm
+
+	mov rax, [rsp]	;# each lane becomes a cache-line address: ((value & RANDOMX_CACHE_MASK) << 6) + rdi
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	mov [rsp], rax	;# the pointer replaces the raw value in place, where ssh_load reads it
+	prefetchnta byte ptr [rax]
+
+	mov rax, [rsp+8]	;# lane 1
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	mov [rsp+8], rax
+	prefetchnta byte ptr [rax]
+
+	mov rax, [rsp+16]	;# lane 2
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	mov [rsp+16], rax
+	prefetchnta byte ptr [rax]
+
+	mov rax, [rsp+24]	;# lane 3
+	and rax, RANDOMX_CACHE_MASK
+	shl rax, 6
+	add rax, rdi
+	mov [rsp+24], rax
+	prefetchnta byte ptr [rax]
--- a/src/crypto/randomx/dataset.cpp
+++ b/src/crypto/randomx/dataset.cpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -59,10 +61,11 @@ namespace randomx {

 	template<class Allocator>
 	void deallocCache(randomx_cache* cache) {  // frees the cache memory (if any) through Allocator and destroys the cache's JIT compiler
-		if (cache->memory != nullptr)
+		if (cache->memory != nullptr) {
 			Allocator::freeMemory(cache->memory, RANDOMX_CACHE_MAX_SIZE);
-		if (cache->jit != nullptr)
-			delete cache->jit;
+		}
+
+		delete cache->jit;  // delete on a null pointer is a no-op, so the former explicit check is not needed
 	}

 	template void deallocCache<DefaultAllocator>(randomx_cache* cache);
@ -77,16 +80,16 @@ namespace randomx {
 		context.pwdlen = (uint32_t)keySize;
 		context.salt = CONST_CAST(uint8_t *)RandomX_CurrentConfig.ArgonSalt;
 		context.saltlen = (uint32_t)strlen(RandomX_CurrentConfig.ArgonSalt);
-		context.secret = NULL;
+		context.secret = nullptr;
 		context.secretlen = 0;
-		context.ad = NULL;
+		context.ad = nullptr;
 		context.adlen = 0;
 		context.t_cost = RandomX_CurrentConfig.ArgonIterations;
 		context.m_cost = RandomX_CurrentConfig.ArgonMemory;
 		context.lanes = RandomX_CurrentConfig.ArgonLanes;
 		context.threads = 1;
-		context.allocate_cbk = NULL;
-		context.free_cbk = NULL;
+		context.allocate_cbk = nullptr;
+		context.free_cbk = nullptr;
 		context.flags = ARGON2_DEFAULT_FLAGS;
 		context.version = ARGON2_VERSION_NUMBER;

@ -100,8 +103,18 @@ namespace randomx {

 	void initCacheCompile(randomx_cache* cache, const void* key, size_t keySize) {  // initCache() followed by JIT generation of the superscalar hash and dataset-init code
 		initCache(cache, key, keySize);
+
+#		ifdef XMRIG_SECURE_JIT
+		cache->jit->enableWriting();  // secure-JIT builds: make the code buffer writable before generating into it
+#		endif
+
 		cache->jit->generateSuperscalarHash(cache->programs);
 		cache->jit->generateDatasetInitCode();
+		cache->datasetInit  = cache->jit->getDatasetInitFunc();  // store the entry point of the generated dataset-init routine in the cache
+
+#		ifdef XMRIG_SECURE_JIT
+		cache->jit->enableExecution();  // switch the buffer back to executable once generation is finished
+#		endif
 	}

 	constexpr uint64_t superscalarMul0 = 6364136223846793005ULL;
--- a/src/crypto/randomx/dataset.hpp
+++ b/src/crypto/randomx/dataset.hpp
@ -48,7 +48,7 @@ struct randomx_cache {
 	randomx::DatasetInitFunc* datasetInit;
 	randomx::SuperscalarProgram programs[RANDOMX_CACHE_MAX_ACCESSES];

-	bool isInitialized() {
+	bool isInitialized() const {  // true once the first superscalar program is non-empty
 		return programs[0].getSize() != 0;
 	}
 };
--- a/src/crypto/randomx/jit_compiler_a64.cpp
+++ b/src/crypto/randomx/jit_compiler_a64.cpp
@ -1,6 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
-Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -28,18 +29,25 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

 #include "crypto/randomx/jit_compiler_a64.hpp"
-#include "crypto/randomx/superscalar.hpp"
+#include "crypto/common/VirtualMemory.h"
 #include "crypto/randomx/program.hpp"
 #include "crypto/randomx/reciprocal.h"
+#include "crypto/randomx/superscalar.hpp"
 #include "crypto/randomx/virtual_memory.hpp"

 static bool hugePagesJIT = false;
+static int optimizedDatasetInit = -1;

 void randomx_set_huge_pages_jit(bool hugePages)  // sets the file-level hugePagesJIT flag, which the JitCompilerA64 constructor ANDs with its hugePagesEnable argument
 {
 	hugePagesJIT = hugePages;
 }

+void randomx_set_optimized_dataset_init(int value)  // stores the requested mode in the file-level optimizedDatasetInit (initially -1)
+{
+	optimizedDatasetInit = value;  // NOTE(review): no reader of this variable is visible in the aarch64 hunks shown - confirm whether it is used here
+}
+
 namespace ARMV8A {

 constexpr uint32_t B           = 0x14000000;
@ -96,37 +104,28 @@ static size_t CalcDatasetItemSize()

 constexpr uint32_t IntRegMap[8] = { 4, 5, 6, 7, 12, 13, 14, 15 };

-JitCompilerA64::JitCompilerA64(bool hugePagesEnable)
-	: code((uint8_t*) allocExecutableMemory(CodeSize + CalcDatasetItemSize(), hugePagesJIT && hugePagesEnable))
-	, literalPos(ImulRcpLiteralsEnd)
-	, num32bitLiterals(0)
+JitCompilerA64::JitCompilerA64(bool hugePagesEnable, bool) :  // the second (unnamed) flag is ignored; no code memory is allocated here any more
+	hugePages(hugePagesJIT && hugePagesEnable),  // huge pages need both the global switch and the per-instance request
+	literalPos(ImulRcpLiteralsEnd)
 {
-	memset(reg_changed_offset, 0, sizeof(reg_changed_offset));
-	memcpy(code, (void*) randomx_program_aarch64, CodeSize);
 }

 JitCompilerA64::~JitCompilerA64()
 {
-	freePagedMemory(code, CodeSize + CalcDatasetItemSize());
-}
-
-#if defined(ios_HOST_OS) || defined (darwin_HOST_OS)
-void sys_icache_invalidate(void *start, size_t len);
-#endif
-
-static void clear_code_cache(char* p1, char* p2)
-{
-#	if defined(ios_HOST_OS) || defined (darwin_HOST_OS)
-	sys_icache_invalidate(p1, static_cast<size_t>(p2 - p1));
-#	elif defined (HAVE_BUILTIN_CLEAR_CACHE) || defined (__GNUC__)
-	__builtin___clear_cache(p1, p2);
-#	else
-#	error "No clear code cache function found"
-#	endif
+	freePagedMemory(code, allocatedSize);  // frees exactly what allocate() reserved; code is nullptr and allocatedSize is 0 if allocate() never ran
 }

 void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& config, uint32_t)
 {
+	if (!allocatedSize) {
+		allocate(CodeSize);
+	}
+#ifdef XMRIG_SECURE_JIT
+	else {
+		enableWriting();
+	}
+#endif
+
 	uint32_t codePos = MainLoopBegin + 4;

 	// and w16, w10, ScratchpadL3Mask64
@ -171,11 +170,22 @@ void JitCompilerA64::generateProgram(Program& program, ProgramConfiguration& con
 	codePos = ((uint8_t*)randomx_program_aarch64_update_spMix1) - ((uint8_t*)randomx_program_aarch64);
 	emit32(ARMV8A::EOR | 10 | (IntRegMap[config.readReg0] << 5) | (IntRegMap[config.readReg1] << 16), code, codePos);

-	clear_code_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#	ifndef XMRIG_OS_APPLE
+	xmrig::VirtualMemory::flushInstructionCache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#	endif
 }

 void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration& config, uint32_t datasetOffset)
 {
+	if (!allocatedSize) {
+		allocate(CodeSize);
+	}
+#ifdef XMRIG_SECURE_JIT
+	else {
+		enableWriting();
+	}
+#endif
+
 	uint32_t codePos = MainLoopBegin + 4;

 	// and w16, w10, ScratchpadL3Mask64
@ -226,12 +236,23 @@ void JitCompilerA64::generateProgramLight(Program& program, ProgramConfiguration
 	emit32(ARMV8A::ADD_IMM_LO | 2 | (2 << 5) | (imm_lo << 10), code, codePos);
 	emit32(ARMV8A::ADD_IMM_HI | 2 | (2 << 5) | (imm_hi << 10), code, codePos);

-	clear_code_cache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#	ifndef XMRIG_OS_APPLE
+	xmrig::VirtualMemory::flushInstructionCache(reinterpret_cast<char*>(code + MainLoopBegin), reinterpret_cast<char*>(code + codePos));
+#	endif
 }

 template<size_t N>
 void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N])
 {
+	if (!allocatedSize) {
+		allocate(CodeSize + CalcDatasetItemSize());
+	}
+#ifdef XMRIG_SECURE_JIT
+	else {
+		enableWriting();
+	}
+#endif
+
 	uint32_t codePos = CodeSize;

 	uint8_t* p1 = (uint8_t*)randomx_calc_dataset_item_aarch64;
@ -342,13 +363,19 @@ void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[N])
 	memcpy(code + codePos, p1, p2 - p1);
 	codePos += p2 - p1;

-	clear_code_cache(reinterpret_cast<char*>(code + CodeSize), reinterpret_cast<char*>(code + codePos));
+#	ifndef XMRIG_OS_APPLE
+	xmrig::VirtualMemory::flushInstructionCache(reinterpret_cast<char*>(code + CodeSize), reinterpret_cast<char*>(code + codePos));
+#	endif
 }

 template void JitCompilerA64::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);

-DatasetInitFunc* JitCompilerA64::getDatasetInitFunc()
+DatasetInitFunc* JitCompilerA64::getDatasetInitFunc() const  // returns the dataset-init entry point inside the JIT buffer
 {
+#	ifdef XMRIG_SECURE_JIT
+	enableExecution();  // secure-JIT builds: make the buffer executable before handing out a callable pointer
+#	endif
+
 	return (DatasetInitFunc*)(code + (((uint8_t*)randomx_init_dataset_aarch64) - ((uint8_t*)randomx_program_aarch64)));  // same offset as randomx_init_dataset_aarch64 has inside the randomx_program_aarch64 template
 }

@ -357,6 +384,26 @@ size_t JitCompilerA64::getCodeSize()
 	return CodeSize;
 }

+void JitCompilerA64::enableWriting() const  // switches the whole allocated JIT buffer to read+write protection
+{
+	xmrig::VirtualMemory::protectRW(code, allocatedSize);
+}
+
+void JitCompilerA64::enableExecution() const  // switches the whole allocated JIT buffer to read+execute protection
+{
+	xmrig::VirtualMemory::protectRX(code, allocatedSize);
+}
+
+
+void JitCompilerA64::allocate(size_t size)  // called by the generate*() methods on first use: reserves size bytes of executable memory and seeds it with the code template
+{
+	allocatedSize = size;  // remembered for the destructor and for enableWriting()/enableExecution()
+	code = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize, hugePages));
+
+	memcpy(code, reinterpret_cast<const void *>(randomx_program_aarch64), CodeSize);  // copies CodeSize bytes of the template regardless of size; callers in this file pass at least CodeSize
+}
+
+
 void JitCompilerA64::emitMovImmediate(uint32_t dst, uint32_t imm, uint8_t* code, uint32_t& codePos)
 {
 	uint32_t k = codePos;
--- a/src/crypto/randomx/jit_compiler_a64.hpp
+++ b/src/crypto/randomx/jit_compiler_a64.hpp
@ -1,6 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
-Copyright (c) 2019, SChernykh    <https://github.com/SChernykh>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -46,7 +47,7 @@ namespace randomx {

 	class JitCompilerA64 {
 	public:
-		explicit JitCompilerA64(bool hugePagesEnable);
+		explicit JitCompilerA64(bool hugePagesEnable, bool optimizedInitDatasetEnable);
 		~JitCompilerA64();

 		void prepare() {}
@ -58,16 +59,32 @@ namespace randomx {

 		void generateDatasetInitCode() {}

-		ProgramFunc* getProgramFunc() { return reinterpret_cast<ProgramFunc*>(code); }
-		DatasetInitFunc* getDatasetInitFunc();
+		inline ProgramFunc *getProgramFunc() const {  // returns the start of the JIT buffer as the program entry point
+#			ifdef XMRIG_SECURE_JIT
+			enableExecution();  // secure-JIT builds: make the buffer executable before handing out a callable pointer
+#			endif
+
+			return reinterpret_cast<ProgramFunc*>(code);
+		}
+
+		DatasetInitFunc* getDatasetInitFunc() const;
 		uint8_t* getCode() { return code; }
 		size_t getCodeSize();

+		void enableWriting() const;
+		void enableExecution() const;
+
 		static InstructionGeneratorA64 engine[256];
-		uint32_t reg_changed_offset[8];
-		uint8_t* code;
+
+	private:
+		const bool hugePages;
+		uint32_t reg_changed_offset[8]{};
+		uint8_t* code = nullptr;
 		uint32_t literalPos;
-		uint32_t num32bitLiterals;
+		uint32_t num32bitLiterals = 0;
+		size_t allocatedSize = 0;
+
+		void allocate(size_t size);

 		static void emit32(uint32_t val, uint8_t* code, uint32_t& codePos)
 		{
@ -90,6 +107,7 @@ namespace randomx {
 		template<uint32_t tmp_reg_fp>
 		void emitMemLoadFP(uint32_t src, Instruction& instr, uint8_t* code, uint32_t& codePos);

+	public:
 		void h_IADD_RS(Instruction&, uint32_t&);
 		void h_IADD_M(Instruction&, uint32_t&);
 		void h_ISUB_R(Instruction&, uint32_t&);
--- a/src/crypto/randomx/jit_compiler_fallback.cpp
+++ b/src/crypto/randomx/jit_compiler_fallback.cpp
@ -35,3 +35,6 @@ void randomx_set_huge_pages_jit(bool)
 {
 }

+void randomx_set_optimized_dataset_init(int)  // fallback build: the setting is accepted and ignored
+{
+}
--- a/src/crypto/randomx/jit_compiler_fallback.hpp
+++ b/src/crypto/randomx/jit_compiler_fallback.hpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -41,7 +43,7 @@ namespace randomx {

 	class JitCompilerFallback {
 	public:
-		explicit JitCompilerFallback(bool) {
+		explicit JitCompilerFallback(bool, bool) {  // both flags are ignored; construction always throws
 			throw std::runtime_error("JIT compilation is not supported on this platform");
 		}
 		void prepare() {}
@ -70,5 +72,7 @@ namespace randomx {
 		size_t getCodeSize() {
 			return 0;
 		}
+		void enableWriting() {}  // no-op in the fallback class
+		void enableExecution() {}  // no-op in the fallback class
 	};
 }
--- a/src/crypto/randomx/jit_compiler_x86.cpp
+++ b/src/crypto/randomx/jit_compiler_x86.cpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -30,14 +32,16 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <cstring>
 #include <climits>
 #include <atomic>
+
 #include "crypto/randomx/jit_compiler_x86.hpp"
+#include "backend/cpu/Cpu.h"
+#include "crypto/common/VirtualMemory.h"
 #include "crypto/randomx/jit_compiler_x86_static.hpp"
-#include "crypto/randomx/superscalar.hpp"
 #include "crypto/randomx/program.hpp"
 #include "crypto/randomx/reciprocal.h"
+#include "crypto/randomx/superscalar.hpp"
 #include "crypto/randomx/virtual_memory.hpp"
-#include "base/tools/Profiler.h"
-#include "backend/cpu/Cpu.h"
+#include "crypto/rx/Profiler.h"

 #ifdef XMRIG_FIX_RYZEN
 #   include "crypto/rx/Rx.h"
@ -45,17 +49,21 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #ifdef _MSC_VER
 #   include <intrin.h>
-#else
-#   include <cpuid.h>
 #endif

 static bool hugePagesJIT = false;
+static int optimizedDatasetInit = -1;

 void randomx_set_huge_pages_jit(bool hugePages)  // sets the file-level hugePagesJIT flag consulted when the JitCompilerX86 constructor allocates code memory
 {
 	hugePagesJIT = hugePages;
 }

+void randomx_set_optimized_dataset_init(int value)  // selects AVX2 dataset init mode read by the JitCompilerX86 constructor: <0 auto detect, 0 always disabled, >0 always enabled
+{
+	optimizedDatasetInit = value;
+}
+
 namespace randomx {
 	/*

@ -112,6 +120,11 @@ namespace randomx {
 	#define codeReadDatasetLightSshInit ADDR(randomx_program_read_dataset_sshash_init)
 	#define codeReadDatasetLightSshFin ADDR(randomx_program_read_dataset_sshash_fin)
 	#define codeDatasetInit ADDR(randomx_dataset_init)
+	#define codeDatasetInitAVX2_prologue ADDR(randomx_dataset_init_avx2_prologue)
+	#define codeDatasetInitAVX2_loop_end ADDR(randomx_dataset_init_avx2_loop_end)
+	#define codeDatasetInitAVX2_loop_epilogue ADDR(randomx_dataset_init_avx2_epilogue)
+	#define codeDatasetInitAVX2_ssh_load ADDR(randomx_dataset_init_avx2_ssh_load)
+	#define codeDatasetInitAVX2_ssh_prefetch ADDR(randomx_dataset_init_avx2_ssh_prefetch)
 	#define codeLoopStore ADDR(randomx_program_loop_store)
 	#define codeLoopEnd ADDR(randomx_program_loop_end)
 	#define codeEpilogue ADDR(randomx_program_epilogue)
@ -128,7 +141,12 @@ namespace randomx {
 	#define readDatasetLightInitSize (codeReadDatasetLightSshFin - codeReadDatasetLightSshInit)
 	#define readDatasetLightFinSize (codeLoopStore - codeReadDatasetLightSshFin)
 	#define loopStoreSize (codeLoopEnd - codeLoopStore)
-	#define datasetInitSize (codeEpilogue - codeDatasetInit)
+	#define datasetInitSize (codeDatasetInitAVX2_prologue - codeDatasetInit)
+	#define datasetInitAVX2_prologue_size (codeDatasetInitAVX2_loop_end - codeDatasetInitAVX2_prologue)
+	#define datasetInitAVX2_loop_end_size (codeDatasetInitAVX2_loop_epilogue - codeDatasetInitAVX2_loop_end)
+	#define datasetInitAVX2_epilogue_size (codeDatasetInitAVX2_ssh_load - codeDatasetInitAVX2_loop_epilogue)
+	#define datasetInitAVX2_ssh_load_size (codeDatasetInitAVX2_ssh_prefetch - codeDatasetInitAVX2_ssh_load)
+	#define datasetInitAVX2_ssh_prefetch_size (codeEpilogue - codeDatasetInitAVX2_ssh_prefetch)
 	#define epilogueSize (codeShhLoad - codeEpilogue)
 	#define codeSshLoadSize (codeShhPrefetch - codeShhLoad)
 	#define codeSshPrefetchSize (codeShhEnd - codeShhPrefetch)
@ -166,20 +184,27 @@ namespace randomx {
 		{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
 	};

+	static inline uint8_t* alignToPage(uint8_t* p, size_t pageSize) {  // rounds p down to the nearest multiple of pageSize
+		size_t k = (size_t) p;
+		k -= k % pageSize;  // drop the offset within the page
+		return (uint8_t*) k;
+	}
+
 	size_t JitCompilerX86::getCodeSize() {  // number of bytes emitted after the prologue
 		return codePos < prologueSize ? 0 : codePos - prologueSize;  // clamps to 0 instead of underflowing when codePos is still inside the prologue
 	}

-    static inline void cpuid(uint32_t level, int32_t output[4])
-    {
-        memset(output, 0, sizeof(int32_t) * 4);
+	void JitCompilerX86::enableWriting() const {  // makes the range from the 4096-byte boundary at or below code up to code + CodeSize read+write
+		uint8_t* p1 = alignToPage(code, 4096);  // code may start mid-page (the constructor offsets it from allocatedCode), so round down to a 4096-byte boundary
+		uint8_t* p2 = code + CodeSize;
+		xmrig::VirtualMemory::protectRW(p1, p2 - p1);
+	}

-#   ifdef _MSC_VER
-        __cpuid(output, static_cast<int>(level));
-#   else
-        __cpuid_count(level, 0, output[0], output[1], output[2], output[3]);
-#   endif
-    }
+	void JitCompilerX86::enableExecution() const {  // makes the range from the 4096-byte boundary at or below code up to code + CodeSize read+execute
+		uint8_t* p1 = alignToPage(code, 4096);  // code may start mid-page (the constructor offsets it from allocatedCode), so round down to a 4096-byte boundary
+		uint8_t* p2 = code + CodeSize;
+		xmrig::VirtualMemory::protectRX(p1, p2 - p1);
+	}

 #	ifdef _MSC_VER
 	static FORCE_INLINE uint32_t rotl32(uint32_t a, int shift) { return _rotl(a, shift); }
@ -190,17 +215,68 @@ namespace randomx {
 	static std::atomic<size_t> codeOffset;
 	constexpr size_t codeOffsetIncrement = 59 * 64;

-	JitCompilerX86::JitCompilerX86(bool hugePagesEnable) {
+	JitCompilerX86::JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable) {
 		BranchesWithin32B = xmrig::Cpu::info()->jccErratum();

-		int32_t info[4];
-		cpuid(1, info);
-		hasAVX = ((info[2] & (1 << 27)) != 0) && ((info[2] & (1 << 28)) != 0);
+		hasAVX = xmrig::Cpu::info()->hasAVX();
+		hasAVX2 = xmrig::Cpu::info()->hasAVX2();

-		cpuid(0x80000001, info);
-		hasXOP = ((info[2] & (1 << 11)) != 0);
+		// Disable by default
+		initDatasetAVX2 = false;

-		allocatedCode = (uint8_t*)allocExecutableMemory(CodeSize * 2, hugePagesJIT && hugePagesEnable);
+		if (optimizedInitDatasetEnable) {
+			// Dataset init using AVX2:
+			// -1 = Auto detect
+			//  0 = Always disabled
+			// +1 = Always enabled
+			if (optimizedDatasetInit > 0) {
+				initDatasetAVX2 = true;
+			}
+			else if (optimizedDatasetInit < 0) {
+				xmrig::ICpuInfo::Vendor vendor = xmrig::Cpu::info()->vendor();
+				xmrig::ICpuInfo::Arch arch = xmrig::Cpu::info()->arch();
+
+				if (vendor == xmrig::ICpuInfo::VENDOR_INTEL) {
+					// AVX2 init is faster on Intel CPUs without HT
+					initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
+				}
+				else if (vendor == xmrig::ICpuInfo::VENDOR_AMD) {
+					switch (arch) {
+					case xmrig::ICpuInfo::ARCH_ZEN:
+					case xmrig::ICpuInfo::ARCH_ZEN_PLUS:
+					default:
+						// AVX2 init is slower on Zen/Zen+
+						// Also disable it for other unknown architectures
+						initDatasetAVX2 = false;
+						break;
+					case xmrig::ICpuInfo::ARCH_ZEN2:
+						// AVX2 init is faster on Zen2 without SMT (mobile CPUs)
+						initDatasetAVX2 = (xmrig::Cpu::info()->cores() == xmrig::Cpu::info()->threads());
+						break;
+					case xmrig::ICpuInfo::ARCH_ZEN3:
+						// AVX2 init is faster on Zen3
+						initDatasetAVX2 = true;
+						break;
+					}
+				}
+			}
+		}
+
+		// Sorry, low-end Intel CPUs
+		if (!hasAVX2) {
+			initDatasetAVX2 = false;
+		}
+
+		hasXOP = xmrig::Cpu::info()->hasXOP();
+
+		allocatedSize = initDatasetAVX2 ? (CodeSize * 4) : (CodeSize * 2);
+		allocatedCode = static_cast<uint8_t*>(allocExecutableMemory(allocatedSize,
+#			ifdef XMRIG_SECURE_JIT
+			false
+#			else
+			hugePagesJIT && hugePagesEnable
+#			endif
+		));

 		// Shift code base address to improve caching - all threads will use different L2/L3 cache sets
 		code = allocatedCode + (codeOffset.fetch_add(codeOffsetIncrement) % CodeSize);
@ -224,7 +300,7 @@ namespace randomx {

 	JitCompilerX86::~JitCompilerX86() {
 		codeOffset.fetch_sub(codeOffsetIncrement);  // undo this instance's contribution to the shared code-offset counter
-		freePagedMemory(allocatedCode, CodeSize);
+		freePagedMemory(allocatedCode, allocatedSize);  // free with the size recorded at allocation (CodeSize * 2 or CodeSize * 4)
 	}

 	void JitCompilerX86::prepare() {
@ -237,6 +313,10 @@ namespace randomx {
 	void JitCompilerX86::generateProgram(Program& prog, ProgramConfiguration& pcfg, uint32_t flags) {
 		PROFILE_SCOPE(RandomX_JIT_compile);

+#		ifdef XMRIG_SECURE_JIT
+		enableWriting();
+#		endif
+
 		vm_flags = flags;

 		generateProgramPrologue(prog, pcfg);
@ -271,14 +351,49 @@ namespace randomx {

 	template<size_t N>
 	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[N]) {
+		uint8_t* p = code;
+		if (initDatasetAVX2) {
+			codePos = 0;
+			emit(codeDatasetInitAVX2_prologue, datasetInitAVX2_prologue_size, code, codePos);
+
+			for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
+				SuperscalarProgram& prog = programs[j];
+				uint32_t pos = codePos;
+				for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
+					generateSuperscalarCode<true>(prog(i), p, pos);
+				}
+				codePos = pos;
+				emit(codeShhLoad, codeSshLoadSize, code, codePos);
+				emit(codeDatasetInitAVX2_ssh_load, datasetInitAVX2_ssh_load_size, code, codePos);
+				if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
+					*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
+					codePos += 3;
+					emit(RandomX_CurrentConfig.codeShhPrefetchTweaked, codeSshPrefetchSize, code, codePos);
+					uint8_t* p = code + codePos;
+					emit(codeDatasetInitAVX2_ssh_prefetch, datasetInitAVX2_ssh_prefetch_size, code, codePos);
+					p[3] += prog.getAddressRegister() << 3;
+				}
+			}
+
+			emit(codeDatasetInitAVX2_loop_end, datasetInitAVX2_loop_end_size, code, codePos);
+
+			// Number of bytes from the start of randomx_dataset_init_avx2_prologue to loop_begin label
+			constexpr int32_t prologue_size = 320;
+			*(int32_t*)(code + codePos - 4) = prologue_size - codePos;
+
+			emit(codeDatasetInitAVX2_loop_epilogue, datasetInitAVX2_epilogue_size, code, codePos);
+			return;
+		}
+
 		memcpy(code + superScalarHashOffset, codeShhInit, codeSshInitSize);
 		codePos = superScalarHashOffset + codeSshInitSize;
 		for (unsigned j = 0; j < RandomX_CurrentConfig.CacheAccesses; ++j) {
 			SuperscalarProgram& prog = programs[j];
-			for (unsigned i = 0; i < prog.getSize(); ++i) {
-				Instruction& instr = prog(i);
-				generateSuperscalarCode(instr);
+			uint32_t pos = codePos;
+			for (uint32_t i = 0, n = prog.getSize(); i < n; ++i) {
+				generateSuperscalarCode<false>(prog(i), p, pos);
 			}
+			codePos = pos;
 			emit(codeShhLoad, codeSshLoadSize, code, codePos);
 			if (j < RandomX_CurrentConfig.CacheAccesses - 1) {
 				*(uint32_t*)(code + codePos) = 0xd88b49 + (static_cast<uint32_t>(prog.getAddressRegister()) << 16);
@ -293,7 +408,10 @@ namespace randomx {
 	void JitCompilerX86::generateSuperscalarHash(SuperscalarProgram(&programs)[RANDOMX_CACHE_MAX_ACCESSES]);

 	void JitCompilerX86::generateDatasetInitCode() {  // copies the static dataset-init routine to the start of the code buffer unless the AVX2 variant is in use
-		memcpy(code, codeDatasetInit, datasetInitSize);
+		// AVX2 code is generated in generateSuperscalarHash()
+		if (!initDatasetAVX2) {
+			memcpy(code, codeDatasetInit, datasetInitSize);
+		}
 	}

 	void JitCompilerX86::generateProgramPrologue(Program& prog, ProgramConfiguration& pcfg) {
@ -372,101 +490,243 @@ namespace randomx {
 		emit32(epilogueOffset - codePos - 4, code, codePos);
 	}

-	void JitCompilerX86::generateSuperscalarCode(Instruction& instr) {
-		static constexpr uint8_t REX_SUB_RR[] = { 0x4d, 0x2b };
-		static constexpr uint8_t REX_MOV_RR64[] = { 0x49, 0x8b };
-		static constexpr uint8_t REX_MOV_R64R[] = { 0x4c, 0x8b };
-		static constexpr uint8_t REX_IMUL_RR[] = { 0x4d, 0x0f, 0xaf };
-		static constexpr uint8_t REX_IMUL_RM[] = { 0x4c, 0x0f, 0xaf };
-		static constexpr uint8_t REX_MUL_R[] = { 0x49, 0xf7 };
-		static constexpr uint8_t REX_81[] = { 0x49, 0x81 };
-		static constexpr uint8_t MOV_RAX_I[] = { 0x48, 0xb8 };
-		static constexpr uint8_t REX_LEA[] = { 0x4f, 0x8d };
-		static constexpr uint8_t REX_XOR_RR[] = { 0x4D, 0x33 };
-		static constexpr uint8_t REX_XOR_RI[] = { 0x49, 0x81 };
-		static constexpr uint8_t REX_ROT_I8[] = { 0x49, 0xc1 };
-
+	template<bool AVX2> // AVX2 == true: after each scalar instruction also emit its counterpart for the ymm registers
+	FORCE_INLINE void JitCompilerX86::generateSuperscalarCode(Instruction& instr, uint8_t* code, uint32_t& codePos) { // emits x86-64 machine code for one SuperscalarHash instruction at code + codePos and advances codePos
 		switch ((SuperscalarInstructionType)instr.opcode)
 		{
 		case randomx::SuperscalarInstructionType::ISUB_R:
-			emit(REX_SUB_RR, code, codePos);
-			emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
+			*(uint32_t*)(code + codePos) = 0x00C02B4DUL + (instr.dst << 19) + (instr.src << 16); // little-endian bytes 4D 2B (the old REX_SUB_RR) + ModRM 0xC0 + 8 * dst + src
+			codePos += 3; // a 4-byte store is used for a 3-byte instruction; whatever is emitted next starts on the 4th byte
+			if (AVX2) {
+				emit32(0xC0FBFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // bytes C5 FD FB C0 patched: byte 3 += 8 * dst + src, byte 1 -= 8 * dst (NOTE(review): decodes to vpsubq ymm_dst, ymm_dst, ymm_src - confirm with a disassembler)
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IXOR_R:
-			emit(REX_XOR_RR, code, codePos);
-			emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
+			*(uint32_t*)(code + codePos) = 0x00C0334DUL + (instr.dst << 19) + (instr.src << 16); // bytes 4D 33 (the old REX_XOR_RR) + ModRM 0xC0 + 8 * dst + src
+			codePos += 3;
+			if (AVX2) {
+				emit32(0xC0EFFDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // same patching scheme as the ISUB_R case, with opcode byte EF instead of FB
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IADD_RS:
-			emit(REX_LEA, code, codePos);
-			emitByte(0x04 + 8 * instr.dst, code, codePos);
-			genSIB(instr.getModShift(), instr.src, instr.dst, code, codePos);
+			emit32(0x00048D4F + (instr.dst << 19) + (genSIB(instr.getModShift(), instr.src, instr.dst) << 24), code, codePos); // bytes 4F 8D (the old REX_LEA), ModRM 0x04 + 8 * dst, then the SIB byte
+			if (AVX2) {
+				if (instr.getModShift()) { // non-zero shift: a separate shift instruction is emitted before the add
+					static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xF0, 0x00, 0xC5, 0xBD, 0xD4, 0xC0 }; // two VEX instructions, 5 + 4 bytes
+					uint8_t* p = code + codePos;
+					emit(t, code, codePos);
+					p[3] += instr.src; // source register of the first instruction
+					p[4] = instr.getModShift(); // shift count (imm8)
+					p[8] += instr.dst * 9; // 9 = 8 + 1: dst goes into both the reg and rm fields of the ModRM byte
+				}
+				else {
+					emit32(0xC0D4FDC5UL + (instr.src << 24) + (instr.dst << 27) - (instr.dst << 11), code, codePos); // shift of 0: a single instruction (opcode byte D4) is enough
+				}
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IMUL_R:
-			emit(REX_IMUL_RR, code, codePos);
-			emitByte(0xc0 + 8 * instr.dst + instr.src, code, codePos);
+			emit32(0xC0AF0F4DUL + (instr.dst << 27) + (instr.src << 24), code, codePos); // bytes 4D 0F AF (the old REX_IMUL_RR) + ModRM 0xC0 + 8 * dst + src
+			if (AVX2) {
+				static const uint8_t t[] = { // NOTE(review): appears to be the same 9-instruction sequence as the "ymm0 *= r0_avx2_mul" code in randomx_dataset_init_avx2_prologue, with dst/src patched in below - confirm with a disassembler
+					0xC5, 0xBD, 0x73, 0xD0, 0x20,
+					0xC5, 0xB5, 0x73, 0xD0, 0x20,
+					0xC5, 0x7D, 0xF4, 0xD0,
+					0xC5, 0x35, 0xF4, 0xD8,
+					0xC5, 0xBD, 0xF4, 0xC0,
+					0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
+					0xC5, 0xFD, 0x73, 0xF0, 0x20,
+					0xC4, 0x41, 0x2D, 0xD4, 0xD3,
+					0xC5, 0xAD, 0xD4, 0xC0
+				};
+				uint8_t* p = code + codePos; // start of the template copy; the offsets below index into t
+				emit(t, code, codePos);
+				p[3] += instr.dst;
+				p[8] += instr.src;
+				p[11] -= instr.dst * 8; // subtracting 8 * dst from the second VEX byte selects dst as the extra (vvvv) operand
+				p[13] += instr.src;
+				p[17] += instr.dst;
+				p[21] += instr.dst * 8 + instr.src;
+				p[29] -= instr.dst * 8;
+				p[31] += instr.dst;
+				p[41] += instr.dst * 9;
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IROR_C:
-			emit(REX_ROT_I8, code, codePos);
-			emitByte(0xc8 + instr.dst, code, codePos);
-			emitByte(instr.getImm32() & 63, code, codePos);
+			{
+				const uint32_t shift = instr.getImm32() & 63; // rotate count, 0..63
+				emit32(0x00C8C149UL + (instr.dst << 16) + (shift << 24), code, codePos); // bytes 49 C1 (the old REX_ROT_I8), ModRM 0xC8 + dst, imm8 = shift
+				if (AVX2) {
+					static const uint8_t t[] = { 0xC5, 0xBD, 0x73, 0xD0, 0x00, 0xC5, 0xB5, 0x73, 0xF0, 0x00, 0xC4, 0xC1, 0x3D, 0xEB, 0xC1 }; // three VEX instructions (5 + 5 + 5 bytes): NOTE(review): looks like shift right, shift left, OR - confirm
+					uint8_t* p = code + codePos;
+					emit(t, code, codePos);
+					p[3] += instr.dst;
+					p[4] = shift; // imm8 of the first shift
+					p[8] += instr.dst;
+					p[9] = 64 - shift; // complementary count for the second shift
+					p[14] += instr.dst * 8; // destination register of the final instruction
+				}
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IADD_C7:
-			emit(REX_81, code, codePos);
-			emitByte(0xc0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
+		case randomx::SuperscalarInstructionType::IADD_C8:
+		case randomx::SuperscalarInstructionType::IADD_C9:
+			if (AVX2) {
+				static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x03, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xD4, 0xC0 }; // 48 B8 + imm64 (the old MOV_RAX_I prefix), a scalar add, then a RIP-relative load (disp32 EC FF FF FF = -20, i.e. back to t[2]) and the vector add
+				uint8_t* p = code + codePos;
+				emit(t, code, codePos);
+				*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); // fills the imm64 placeholder with the immediate widened by signExtend2sCompl()
+				p[12] += instr.dst * 8; // ModRM reg field of the scalar add
+				p[24] -= instr.dst * 8;
+				p[26] += instr.dst * 8;
+			}
+			else {
+				*(uint32_t*)(code + codePos) = 0x00C08149UL + (instr.dst << 16); // bytes 49 81 (the old REX_81), ModRM 0xC0 + dst
+				codePos += 3;
+				emit32(instr.getImm32(), code, codePos); // imm32 operand
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IXOR_C7:
-			emit(REX_XOR_RI, code, codePos);
-			emitByte(0xf0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
-			break;
-		case randomx::SuperscalarInstructionType::IADD_C8:
-			emit(REX_81, code, codePos);
-			emitByte(0xc0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
-			break;
 		case randomx::SuperscalarInstructionType::IXOR_C8:
-			emit(REX_XOR_RI, code, codePos);
-			emitByte(0xf0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
-			break;
-		case randomx::SuperscalarInstructionType::IADD_C9:
-			emit(REX_81, code, codePos);
-			emitByte(0xc0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
-			break;
 		case randomx::SuperscalarInstructionType::IXOR_C9:
-			emit(REX_XOR_RI, code, codePos);
-			emitByte(0xf0 + instr.dst, code, codePos);
-			emit32(instr.getImm32(), code, codePos);
+			if (AVX2) {
+				static const uint8_t t[] = { 0x48, 0xB8, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x4C, 0x33, 0xC0, 0xC4, 0x62, 0x7D, 0x19, 0x05, 0xEC, 0xFF, 0xFF, 0xFF, 0xC4, 0xC1, 0x7D, 0xEF, 0xC0 }; // same layout as the IADD_C template; only the two opcode bytes differ (33 / EF instead of 03 / D4)
+				uint8_t* p = code + codePos;
+				emit(t, code, codePos);
+				*(uint64_t*)(p + 2) = signExtend2sCompl(instr.getImm32()); // fills the imm64 placeholder
+				p[12] += instr.dst * 8;
+				p[24] -= instr.dst * 8;
+				p[26] += instr.dst * 8;
+			}
+			else {
+				*(uint32_t*)(code + codePos) = 0x00F08149UL + (instr.dst << 16); // bytes 49 81 (the old REX_XOR_RI), ModRM 0xF0 + dst
+				codePos += 3;
+				emit32(instr.getImm32(), code, codePos); // imm32 operand
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IMULH_R:
-			emit(REX_MOV_RR64, code, codePos);
-			emitByte(0xc0 + instr.dst, code, codePos);
-			emit(REX_MUL_R, code, codePos);
-			emitByte(0xe0 + instr.src, code, codePos);
-			emit(REX_MOV_R64R, code, codePos);
-			emitByte(0xc2 + 8 * instr.dst, code, codePos);
+			*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); // bytes 49 8B (the old REX_MOV_RR64), ModRM 0xC0 + dst
+			codePos += 3;
+			*(uint32_t*)(code + codePos) = 0x00E0F749UL + (instr.src << 16); // bytes 49 F7 (the old REX_MUL_R), ModRM 0xE0 + src
+			codePos += 3;
+			*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); // bytes 4C 8B (the old REX_MOV_R64R), ModRM 0xC2 + 8 * dst
+			codePos += 3;
+			if (AVX2) {
+				static const uint8_t t[] = { // AVX2 counterpart of the three scalar instructions above; register numbers are patched in below
+					0xC5, 0xBD, 0x73, 0xD0, 0x20,
+					0xC5, 0xB5, 0x73, 0xD0, 0x20,
+					0xC5, 0x7D, 0xF4, 0xD0,
+					0xC5, 0x3D, 0xF4, 0xD8,
+					0xC4, 0x41, 0x7D, 0xF4, 0xE1,
+					0xC4, 0xC1, 0x3D, 0xF4, 0xC1,
+					0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
+					0xC4, 0x41, 0x25, 0xEF, 0xC6,
+					0xC4, 0x41, 0x25, 0xD4, 0xDC,
+					0xC4, 0x41, 0x25, 0xD4, 0xDA,
+					0xC4, 0x41, 0x25, 0xEF, 0xCE,
+					0xC4, 0x42, 0x3D, 0x37, 0xC1,
+					0xC4, 0x41, 0x3D, 0xDB, 0xC7,
+					0xC5, 0xBD, 0xD4, 0xC0,
+					0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
+					0xC5, 0xA5, 0xD4, 0xC0
+				};
+				uint8_t* p = code + codePos; // start of the template copy; the offsets below index into t
+				emit(t, code, codePos);
+				p[3] += instr.dst;
+				p[8] += instr.src;
+				p[11] -= instr.dst * 8;
+				p[13] += instr.src;
+				p[17] += instr.src;
+				p[20] -= instr.dst * 8;
+				p[27] += instr.dst * 8;
+				p[67] += instr.dst * 9;
+				p[77] += instr.dst * 9;
+			}
 			break;
 		case randomx::SuperscalarInstructionType::ISMULH_R:
-			emit(REX_MOV_RR64, code, codePos);
-			emitByte(0xc0 + instr.dst, code, codePos);
-			emit(REX_MUL_R, code, codePos);
-			emitByte(0xe8 + instr.src, code, codePos);
-			emit(REX_MOV_R64R, code, codePos);
-			emitByte(0xc2 + 8 * instr.dst, code, codePos);
+			*(uint32_t*)(code + codePos) = 0x00C08B49UL + (instr.dst << 16); // bytes 49 8B (the old REX_MOV_RR64), ModRM 0xC0 + dst
+			codePos += 3;
+			*(uint32_t*)(code + codePos) = 0x00E8F749UL + (instr.src << 16); // bytes 49 F7, ModRM 0xE8 + src: the only scalar difference from the IMULH_R case (0xE0 there)
+			codePos += 3;
+			*(uint32_t*)(code + codePos) = 0x00C28B4CUL + (instr.dst << 19); // bytes 4C 8B (the old REX_MOV_R64R), ModRM 0xC2 + 8 * dst
+			codePos += 3;
+			if (AVX2) {
+				static const uint8_t t[] = { // AVX2 counterpart of the three scalar instructions above; register numbers are patched in below
+					0xC5, 0xBD, 0x73, 0xD0, 0x20,
+					0xC5, 0xB5, 0x73, 0xD0, 0x20,
+					0xC5, 0x7D, 0xF4, 0xD0,
+					0xC5, 0x3D, 0xF4, 0xD8,
+					0xC4, 0x41, 0x7D, 0xF4, 0xE1,
+					0xC4, 0x41, 0x3D, 0xF4, 0xE9,
+					0xC4, 0xC1, 0x2D, 0x73, 0xD2, 0x20,
+					0xC4, 0x41, 0x25, 0xEF, 0xC6,
+					0xC4, 0x41, 0x25, 0xD4, 0xDC,
+					0xC4, 0x41, 0x25, 0xD4, 0xDA,
+					0xC4, 0x41, 0x25, 0xEF, 0xCE,
+					0xC4, 0x42, 0x3D, 0x37, 0xC1,
+					0xC4, 0x41, 0x3D, 0xDB, 0xC7,
+					0xC4, 0x41, 0x15, 0xD4, 0xE8,
+					0xC4, 0xC1, 0x25, 0x73, 0xD3, 0x20,
+					0xC4, 0x41, 0x15, 0xD4, 0xC3,
+					0xC4, 0x41, 0x35, 0xEF, 0xC9,
+					0xC4, 0x62, 0x35, 0x37, 0xD0,
+					0xC4, 0x62, 0x35, 0x37, 0xD8,
+					0xC5, 0x2D, 0xDB, 0xD0,
+					0xC5, 0x25, 0xDB, 0xD8,
+					0xC4, 0x41, 0x3D, 0xFB, 0xC2,
+					0xC4, 0xC1, 0x3D, 0xFB, 0xC3
+				};
+				uint8_t* p = code + codePos; // start of the template copy; the offsets below index into t
+				emit(t, code, codePos);
+				p[3] += instr.dst;
+				p[8] += instr.src;
+				p[11] -= instr.dst * 8;
+				p[13] += instr.src;
+				p[17] += instr.src;
+				p[20] -= instr.dst * 8;
+				p[89] += instr.dst;
+				p[94] += instr.src;
+				p[98] += instr.src;
+				p[102] += instr.dst;
+				p[112] += instr.dst * 8;
+			}
 			break;
 		case randomx::SuperscalarInstructionType::IMUL_RCP:
-			emit(MOV_RAX_I, code, codePos);
+			*(uint32_t*)(code + codePos) = 0x0000B848UL; // bytes 48 B8 (the old MOV_RAX_I); the two zero bytes are overwritten by the imm64 below
+			codePos += 2;
 			emit64(randomx_reciprocal_fast(instr.getImm32()), code, codePos);
-			emit(REX_IMUL_RM, code, codePos);
-			emitByte(0xc0 + 8 * instr.dst, code, codePos);
+			emit32(0xC0AF0F4CUL + (instr.dst << 27), code, codePos); // bytes 4C 0F AF (the old REX_IMUL_RM), ModRM 0xC0 + 8 * dst
+			if (AVX2) {
+				static const uint8_t t[] = { // first instruction is RIP-relative with disp32 EB FF FF FF = -21, which lands on the imm64 emitted above (9-byte instruction, preceded by 4 + 8 bytes)
+					0xC4, 0x62, 0x7D, 0x19, 0x25, 0xEB, 0xFF, 0xFF, 0xFF,
+					0xC5, 0xBD, 0x73, 0xD0, 0x20,
+					0xC4, 0xC1, 0x35, 0x73, 0xD4, 0x20,
+					0xC4, 0x41, 0x7D, 0xF4, 0xD4,
+					0xC5, 0x35, 0xF4, 0xD8,
+					0xC4, 0xC1, 0x3D, 0xF4, 0xC4,
+					0xC4, 0xC1, 0x25, 0x73, 0xF3, 0x20,
+					0xC5, 0xFD, 0x73, 0xF0, 0x20,
+					0xC4, 0x41, 0x2D, 0xD4, 0xD3,
+					0xC5, 0xAD, 0xD4, 0xC0
+				};
+				uint8_t* p = code + codePos; // start of the template copy; only dst is patched because the other operand is the constant
+				emit(t, code, codePos);
+				p[12] += instr.dst;
+				p[22] -= instr.dst * 8;
+				p[28] += instr.dst;
+				p[33] += instr.dst * 8;
+				p[41] -= instr.dst * 8;
+				p[43] += instr.dst;
+				p[53] += instr.dst * 9;
+			}
 			break;
 		default:
 			UNREACHABLE;
 		}
 	}

+	template void JitCompilerX86::generateSuperscalarCode<false>(Instruction&, uint8_t*, uint32_t&); // explicit instantiation: scalar-only code generation
+	template void JitCompilerX86::generateSuperscalarCode<true>(Instruction&, uint8_t*, uint32_t&); // explicit instantiation: scalar code plus the AVX2 counterpart of each instruction
+
 	template<bool rax>
 	FORCE_INLINE void JitCompilerX86::genAddressReg(const Instruction& instr, const uint32_t src, uint8_t* code, uint32_t& codePos) {
 		*(uint32_t*)(code + codePos) = (rax ? 0x24808d41 : 0x24888d41) + (src << 16);
@ -546,10 +806,6 @@ namespace randomx {
 		codePos = pos;
 	}

-	void JitCompilerX86::genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos) {
-		emitByte((scale << 6) | (index << 3) | base, code, codePos);
-	}
-
 	void JitCompilerX86::h_ISUB_R(const Instruction& instr) {
 		uint8_t* const p = code;
 		uint32_t pos = codePos;
@ -1088,11 +1344,11 @@ namespace randomx {
 		pos += 14;

 		if (jmp_offset >= -128) {
-			*(uint32_t*)(p + pos) = 0x74 + (jmp_offset << 8);
+			*(uint32_t*)(p + pos) = 0x74 + (static_cast<uint32_t>(jmp_offset) << 8);
 			pos += 2;
 		}
 		else {
-			*(uint64_t*)(p + pos) = 0x840f + ((static_cast<int64_t>(jmp_offset) - 4) << 16);
+			*(uint64_t*)(p + pos) = 0x840f + (static_cast<uint64_t>(jmp_offset - 4) << 16);
 			pos += 6;
 		}

--- a/src/crypto/randomx/jit_compiler_x86.hpp
+++ b/src/crypto/randomx/jit_compiler_x86.hpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -47,7 +49,7 @@ namespace randomx {

 	class JitCompilerX86 {
 	public:
-		explicit JitCompilerX86(bool hugePagesEnable);
+		explicit JitCompilerX86(bool hugePagesEnable, bool optimizedInitDatasetEnable);
 		~JitCompilerX86();
 		void prepare();
 		void generateProgram(Program&, ProgramConfiguration&, uint32_t);
@ -55,24 +57,38 @@ namespace randomx {
 		template<size_t N>
 		void generateSuperscalarHash(SuperscalarProgram (&programs)[N]);
 		void generateDatasetInitCode();
-		ProgramFunc* getProgramFunc() {
-			return (ProgramFunc*)code;
+
+		inline ProgramFunc *getProgramFunc() const { // returns the start of the code buffer as a ProgramFunc pointer
+#			ifdef XMRIG_SECURE_JIT
+			enableExecution(); // NOTE(review): presumably switches the buffer from writable to executable before it is called - confirm in the .cpp
+#			endif
+
+			return reinterpret_cast<ProgramFunc*>(code);
 		}
-		DatasetInitFunc* getDatasetInitFunc() {
+
+		inline DatasetInitFunc *getDatasetInitFunc() const { // returns the start of the code buffer as a DatasetInitFunc pointer
+# 			ifdef XMRIG_SECURE_JIT
+			enableExecution(); // NOTE(review): presumably switches the buffer from writable to executable before it is called - confirm in the .cpp
+#			endif
+
 			return (DatasetInitFunc*)code;
 		}
+
 		uint8_t* getCode() {
 			return code;
 		}
 		size_t getCodeSize();
+		void enableWriting() const;
+		void enableExecution() const;

 		alignas(64) static InstructionGeneratorX86 engine[256];

-		int registerUsage[RegistersCount];
-		uint8_t* code;
-		uint32_t codePos;
-		uint32_t codePosFirst;
-		uint32_t vm_flags;
+	private:
+		int registerUsage[RegistersCount] = {};
+		uint8_t* code = nullptr;
+		uint32_t codePos = 0;
+		uint32_t codePosFirst = 0;
+		uint32_t vm_flags = 0;

 #		ifdef XMRIG_FIX_RYZEN
 		std::pair<const void*, const void*> mainLoopBounds;
@ -80,9 +96,12 @@ namespace randomx {

 		bool BranchesWithin32B = false;
 		bool hasAVX;
+		bool hasAVX2;
+		bool initDatasetAVX2;
 		bool hasXOP;

-		uint8_t* allocatedCode;
+		uint8_t* allocatedCode = nullptr;
+		size_t allocatedSize = 0;

 		void generateProgramPrologue(Program&, ProgramConfiguration&);
 		void generateProgramEpilogue(Program&, ProgramConfiguration&);
@ -90,9 +109,10 @@ namespace randomx {
 		static void genAddressReg(const Instruction&, const uint32_t src, uint8_t* code, uint32_t& codePos);
 		static void genAddressRegDst(const Instruction&, uint8_t* code, uint32_t& codePos);
 		static void genAddressImm(const Instruction&, uint8_t* code, uint32_t& codePos);
-		static void genSIB(int scale, int index, int base, uint8_t* code, uint32_t& codePos);
+		static uint32_t genSIB(int scale, int index, int base) { return (scale << 6) | (index << 3) | base; } // packs an x86 SIB byte (scale in bits 7-6, index in bits 5-3, base in bits 2-0) and returns it instead of emitting it

-		void generateSuperscalarCode(Instruction &);
+		template<bool AVX2>
+		void generateSuperscalarCode(Instruction& inst, uint8_t* code, uint32_t& codePos);

 		static void emitByte(uint8_t val, uint8_t* code, uint32_t& codePos) {
 			code[codePos] = val;
@ -119,6 +139,7 @@ namespace randomx {
 			codePos += count;
 		}

+	public:
 		void h_IADD_RS(const Instruction&);
 		void h_IADD_M(const Instruction&);
 		void h_ISUB_R(const Instruction&);
--- a/src/crypto/randomx/jit_compiler_x86_static.S
+++ b/src/crypto/randomx/jit_compiler_x86_static.S
@ -52,6 +52,11 @@
 .global DECL(randomx_program_loop_store)
 .global DECL(randomx_program_loop_end)
 .global DECL(randomx_dataset_init)
+.global DECL(randomx_dataset_init_avx2_prologue)
+.global DECL(randomx_dataset_init_avx2_loop_end)
+.global DECL(randomx_dataset_init_avx2_epilogue)
+.global DECL(randomx_dataset_init_avx2_ssh_load)
+.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
 .global DECL(randomx_program_epilogue)
 .global DECL(randomx_sshash_load)
 .global DECL(randomx_sshash_prefetch)
@ -192,6 +197,97 @@ call_offset:
 	pop rbx
 	ret

+.balign 64
+DECL(randomx_dataset_init_avx2_prologue):
+	#include "asm/program_sshash_avx2_save_registers.inc"
+
+#if defined(WINABI)
+	mov rdi, qword ptr [rcx] ;# cache->memory
+	mov rsi, rdx ;# dataset
+	mov rbp, r8  ;# block index
+	push r9      ;# max. block index
+#else
+	mov rdi, qword ptr [rdi] ;# cache->memory
+	;# dataset in rsi
+	mov rbp, rdx  ;# block index
+	push rcx      ;# max. block index
+#endif
+	sub rsp, 40 ;# stack scratch area; [rsp+32] is written below to broadcast the block index
+
+	jmp randomx_dataset_init_avx2_prologue_loop_begin ;# jump over the constants that are embedded right here in the code
+	#include "asm/program_sshash_avx2_constants.inc"
+
+.balign 64
+randomx_dataset_init_avx2_prologue_loop_begin:
+	#include "asm/program_sshash_avx2_loop_begin.inc"
+
+	;# init integer registers r8-r15 (lane 0): r8 = (block index + 1) * r0_avx2_mul, r9-r15 = r8 xor a per-register constant
+	lea r8, [rbp+1] ;# r8 = block index + 1
+	imul r8, qword ptr [r0_avx2_mul+rip]
+	mov r9, qword ptr [r1_avx2_add+rip]
+	xor r9, r8 ;# r9 = r8 ^ r1_avx2_add; r10-r15 below follow the same pattern
+	mov r10, qword ptr [r2_avx2_add+rip]
+	xor r10, r8
+	mov r11, qword ptr [r3_avx2_add+rip]
+	xor r11, r8
+	mov r12, qword ptr [r4_avx2_add+rip]
+	xor r12, r8
+	mov r13, qword ptr [r5_avx2_add+rip]
+	xor r13, r8
+	mov r14, qword ptr [r6_avx2_add+rip]
+	xor r14, r8
+	mov r15, qword ptr [r7_avx2_add+rip]
+	xor r15, r8
+
+	;# init AVX registers ymm0-ymm7 (lanes 1-4) the same way for the four block indices after lane 0
+	mov qword ptr [rsp+32], rbp ;# spill the block index so it can be broadcast from memory
+	vbroadcastsd ymm0, qword ptr [rsp+32]
+	vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip] ;# per-lane block index + 2, + 3, + 4, + 5 (lane 0 used + 1 above)
+
+	;# ymm0 *= r0_avx2_mul (low 64 bits of each product, built from 32x32-bit vpmuludq partial products)
+	vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
+	vpsrlq ymm8, ymm0, 32 ;# ymm8 = high dwords of ymm0
+	vpsrlq ymm9, ymm1, 32 ;# ymm9 = high dwords of ymm1
+	vpmuludq ymm10, ymm0, ymm1 ;# lo(ymm0) * lo(ymm1)
+	vpmuludq ymm11, ymm9, ymm0 ;# hi(ymm1) * lo(ymm0)
+	vpmuludq ymm0, ymm8, ymm1 ;# hi(ymm0) * lo(ymm1)
+	vpsllq ymm11, ymm11, 32 ;# cross terms are moved into the upper 32 bits
+	vpsllq ymm0, ymm0, 32
+	vpaddq ymm10, ymm10, ymm11
+	vpaddq ymm0, ymm10, ymm0 ;# ymm0 = sum of the three partial products
+
+	vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
+	vpxor ymm1, ymm0, ymm1 ;# ymm1 = ymm0 ^ r1_avx2_add; ymm2-ymm7 below follow the same pattern
+	vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
+	vpxor ymm2, ymm0, ymm2
+	vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
+	vpxor ymm3, ymm0, ymm3
+	vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
+	vpxor ymm4, ymm0, ymm4
+	vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
+	vpxor ymm5, ymm0, ymm5
+	vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
+	vpxor ymm6, ymm0, ymm6
+	vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
+	vpxor ymm7, ymm0, ymm7
+
+	vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
+	vpsllq ymm14, ymm15, 31                              ;# sign64 (bit 63)
+
+	;# generated SuperscalarHash code goes here
+
+DECL(randomx_dataset_init_avx2_loop_end):
+	#include "asm/program_sshash_avx2_loop_end.inc"
+
+DECL(randomx_dataset_init_avx2_epilogue):
+	#include "asm/program_sshash_avx2_epilogue.inc"
+
+DECL(randomx_dataset_init_avx2_ssh_load):
+	#include "asm/program_sshash_avx2_ssh_load.inc"
+
+DECL(randomx_dataset_init_avx2_ssh_prefetch):
+	#include "asm/program_sshash_avx2_ssh_prefetch.inc"
+
 .balign 64
 DECL(randomx_program_epilogue):
 	#include "asm/program_epilogue_store.inc"
--- a/src/crypto/randomx/jit_compiler_x86_static.asm
+++ b/src/crypto/randomx/jit_compiler_x86_static.asm
@ -41,6 +41,11 @@ PUBLIC randomx_program_read_dataset_ryzen
 PUBLIC randomx_program_read_dataset_sshash_init
 PUBLIC randomx_program_read_dataset_sshash_fin
 PUBLIC randomx_dataset_init
+PUBLIC randomx_dataset_init_avx2_prologue
+PUBLIC randomx_dataset_init_avx2_loop_end
+PUBLIC randomx_dataset_init_avx2_epilogue
+PUBLIC randomx_dataset_init_avx2_ssh_load
+PUBLIC randomx_dataset_init_avx2_ssh_prefetch
 PUBLIC randomx_program_loop_store
 PUBLIC randomx_program_loop_end
 PUBLIC randomx_program_epilogue
@ -183,6 +188,94 @@ init_block_loop:
 randomx_dataset_init ENDP

 ALIGN 64
+randomx_dataset_init_avx2_prologue PROC
+	include asm/program_sshash_avx2_save_registers.inc
+
+	mov rdi, qword ptr [rcx]		;# cache->memory
+	mov rsi, rdx					;# dataset
+	mov rbp, r8						;# block index
+	push r9							;# max. block index
+	sub rsp, 40 ;# stack scratch area; [rsp+32] is written below to broadcast the block index
+
+	jmp loop_begin ;# jump over the constants that are embedded right here in the code
+	include asm/program_sshash_avx2_constants.inc
+
+ALIGN 64
+loop_begin:
+	include asm/program_sshash_avx2_loop_begin.inc
+
+	;# init integer registers r8-r15 (lane 0): r8 = (block index + 1) * r0_avx2_mul, r9-r15 = r8 xor a per-register constant
+	lea r8, [rbp+1] ;# r8 = block index + 1
+	imul r8, qword ptr [r0_avx2_mul]
+	mov r9, qword ptr [r1_avx2_add]
+	xor r9, r8 ;# r9 = r8 ^ r1_avx2_add; r10-r15 below follow the same pattern
+	mov r10, qword ptr [r2_avx2_add]
+	xor r10, r8
+	mov r11, qword ptr [r3_avx2_add]
+	xor r11, r8
+	mov r12, qword ptr [r4_avx2_add]
+	xor r12, r8
+	mov r13, qword ptr [r5_avx2_add]
+	xor r13, r8
+	mov r14, qword ptr [r6_avx2_add]
+	xor r14, r8
+	mov r15, qword ptr [r7_avx2_add]
+	xor r15, r8
+
+	;# init AVX registers ymm0-ymm7 (lanes 1-4) the same way for the four block indices after lane 0
+	mov qword ptr [rsp+32], rbp ;# spill the block index so it can be broadcast from memory
+	vbroadcastsd ymm0, qword ptr [rsp+32]
+	vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments] ;# per-lane block index + 2, + 3, + 4, + 5 (lane 0 used + 1 above)
+
+	;# ymm0 *= r0_avx2_mul (low 64 bits of each product, built from 32x32-bit vpmuludq partial products)
+	vbroadcastsd ymm1, qword ptr [r0_avx2_mul]
+	vpsrlq ymm8, ymm0, 32 ;# ymm8 = high dwords of ymm0
+	vpsrlq ymm9, ymm1, 32 ;# ymm9 = high dwords of ymm1
+	vpmuludq ymm10, ymm0, ymm1 ;# lo(ymm0) * lo(ymm1)
+	vpmuludq ymm11, ymm9, ymm0 ;# hi(ymm1) * lo(ymm0)
+	vpmuludq ymm0, ymm8, ymm1 ;# hi(ymm0) * lo(ymm1)
+	vpsllq ymm11, ymm11, 32 ;# cross terms are moved into the upper 32 bits
+	vpsllq ymm0, ymm0, 32
+	vpaddq ymm10, ymm10, ymm11
+	vpaddq ymm0, ymm10, ymm0 ;# ymm0 = sum of the three partial products
+
+	vbroadcastsd ymm1, qword ptr [r1_avx2_add]
+	vpxor ymm1, ymm0, ymm1 ;# ymm1 = ymm0 ^ r1_avx2_add; ymm2-ymm7 below follow the same pattern
+	vbroadcastsd ymm2, qword ptr [r2_avx2_add]
+	vpxor ymm2, ymm0, ymm2
+	vbroadcastsd ymm3, qword ptr [r3_avx2_add]
+	vpxor ymm3, ymm0, ymm3
+	vbroadcastsd ymm4, qword ptr [r4_avx2_add]
+	vpxor ymm4, ymm0, ymm4
+	vbroadcastsd ymm5, qword ptr [r5_avx2_add]
+	vpxor ymm5, ymm0, ymm5
+	vbroadcastsd ymm6, qword ptr [r6_avx2_add]
+	vpxor ymm6, ymm0, ymm6
+	vbroadcastsd ymm7, qword ptr [r7_avx2_add]
+	vpxor ymm7, ymm0, ymm7
+
+	vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data] ;# carry_bit (bit 32)
+	vpsllq ymm14, ymm15, 31                          ;# sign64 (bit 63)
+randomx_dataset_init_avx2_prologue ENDP
+
+	;# generated SuperscalarHash code goes here
+
+randomx_dataset_init_avx2_loop_end PROC
+	include asm/program_sshash_avx2_loop_end.inc
+randomx_dataset_init_avx2_loop_end ENDP
+
+randomx_dataset_init_avx2_epilogue PROC
+	include asm/program_sshash_avx2_epilogue.inc
+randomx_dataset_init_avx2_epilogue ENDP
+
+randomx_dataset_init_avx2_ssh_load PROC
+	include asm/program_sshash_avx2_ssh_load.inc
+randomx_dataset_init_avx2_ssh_load ENDP
+
+randomx_dataset_init_avx2_ssh_prefetch PROC
+	include asm/program_sshash_avx2_ssh_prefetch.inc
+randomx_dataset_init_avx2_ssh_prefetch ENDP
+
 randomx_program_epilogue PROC
 	include asm/program_epilogue_store.inc
 	include asm/program_epilogue_win64.inc
--- a/src/crypto/randomx/jit_compiler_x86_static.hpp
+++ b/src/crypto/randomx/jit_compiler_x86_static.hpp
@ -44,6 +44,11 @@ extern "C" {
 	void randomx_program_loop_store();
 	void randomx_program_loop_end();
 	void randomx_dataset_init();
+	void randomx_dataset_init_avx2_prologue();
+	void randomx_dataset_init_avx2_loop_end();
+	void randomx_dataset_init_avx2_epilogue();
+	void randomx_dataset_init_avx2_ssh_load();
+	void randomx_dataset_init_avx2_ssh_prefetch();
 	void randomx_program_epilogue();
 	void randomx_sshash_load();
 	void randomx_sshash_prefetch();
--- a/src/crypto/randomx/randomx.cpp
+++ b/src/crypto/randomx/randomx.cpp
@ -53,7 +53,7 @@ extern "C" {
 #include "crypto/randomx/defyx/KangarooTwelve.h"
 }

-#include "base/tools/Profiler.h"
+#include "crypto/rx/Profiler.h"

 RandomX_ConfigurationWownero::RandomX_ConfigurationWownero()
 {
@ -444,9 +444,9 @@ extern "C" {
 					break;

 				case RANDOMX_FLAG_JIT:
-					cache->jit          = new randomx::JitCompiler(false);
+					cache->jit          = new randomx::JitCompiler(false, true);
 					cache->initialize   = &randomx::initCacheCompile;
-					cache->datasetInit  = cache->jit->getDatasetInitFunc();
+					cache->datasetInit  = nullptr;
 					cache->memory       = memory;
 					break;

--- a/src/crypto/randomx/randomx.h
+++ b/src/crypto/randomx/randomx.h
@ -177,6 +177,7 @@ void randomx_apply_config(const T& config)

 void randomx_set_scratchpad_prefetch_mode(int mode);
 void randomx_set_huge_pages_jit(bool hugePages);
+void randomx_set_optimized_dataset_init(int value);

 #if defined(__cplusplus)
 extern "C" {
--- a/src/crypto/randomx/superscalar.cpp
+++ b/src/crypto/randomx/superscalar.cpp
@ -196,7 +196,7 @@ namespace randomx {
 		int latency_;
 		int resultOp_ = 0;
 		int dstOp_ = 0;
-		int srcOp_;
+		int srcOp_ = 0;

 		SuperscalarInstructionInfo(const char* name)
 			: name_(name), type_(SuperscalarInstructionType::INVALID), latency_(0) {}
@ -282,11 +282,11 @@ namespace randomx {
 			return fetchNextDefault(gen);
 		}
 	private:
-		const char* name_;
-		int index_;
-		const int* counts_;
-		int opsCount_;
-		DecoderBuffer() : index_(-1) {}
+		const char* name_ = nullptr;
+		int index_ = -1;
+		const int* counts_ = nullptr;
+		int opsCount_ = 0;
+		DecoderBuffer() = default;
 		static const DecoderBuffer decodeBuffer484;
 		static const DecoderBuffer decodeBuffer7333;
 		static const DecoderBuffer decodeBuffer3733;
@ -555,10 +555,10 @@ namespace randomx {
 		const SuperscalarInstructionInfo* info_;
 		int src_ = -1;
 		int dst_ = -1;
-		int mod_;
-		uint32_t imm32_;
-		SuperscalarInstructionType opGroup_;
-		int opGroupPar_;
+		int mod_ = 0;
+		uint32_t imm32_ = 0;
+		SuperscalarInstructionType opGroup_ = SuperscalarInstructionType::INVALID;
+		int opGroupPar_ = 0;
 		bool canReuse_ = false;
 		bool groupParIsSource_ = false;

--- a/src/crypto/randomx/superscalar_program.hpp
+++ b/src/crypto/randomx/superscalar_program.hpp
@ -39,13 +39,13 @@ namespace randomx {
 		Instruction& operator()(int pc) {
 			return programBuffer[pc];
 		}
-		uint32_t getSize() {
+		uint32_t getSize() const { // returns the stored size (the value last passed to setSize); now callable on const objects
 			return size;
 		}
 		void setSize(uint32_t val) {
 			size = val;
 		}
-		int getAddressRegister() {
+		int getAddressRegister() const { // returns the stored address register index; now callable on const objects
 			return addrReg;
 		}
 		void setAddressRegister(int val) {
--- a/src/crypto/randomx/virtual_machine.cpp
+++ b/src/crypto/randomx/virtual_machine.cpp
@ -30,13 +30,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <iomanip>
 #include <stdexcept>
 #include "crypto/randomx/virtual_machine.hpp"
-#include "crypto/randomx/common.hpp"
 #include "crypto/randomx/aes_hash.hpp"
-#include "crypto/randomx/blake2/blake2.h"
-#include "crypto/randomx/intrin_portable.h"
 #include "crypto/randomx/allocator.hpp"
+#include "crypto/randomx/blake2/blake2.h"
+#include "crypto/randomx/common.hpp"
+#include "crypto/randomx/intrin_portable.h"
 #include "crypto/randomx/soft_aes.h"
-#include "base/tools/Profiler.h"
+#include "crypto/rx/Profiler.h"

 randomx_vm::~randomx_vm() {

--- a/src/crypto/randomx/vm_compiled.cpp
+++ b/src/crypto/randomx/vm_compiled.cpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -28,7 +30,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #include "crypto/randomx/vm_compiled.hpp"
 #include "crypto/randomx/common.hpp"
-#include "base/tools/Profiler.h"
+#include "crypto/rx/Profiler.h"

 namespace randomx {

@ -56,9 +58,9 @@ namespace randomx {
 	void CompiledVm<softAes>::execute() {
 		PROFILE_SCOPE(RandomX_JIT_execute);

-#ifdef XMRIG_ARM
+#		ifdef XMRIG_ARM
 		memcpy(reg.f, config.eMask, sizeof(config.eMask));
-#endif
+#		endif
 		compiler.getProgramFunc()(reg, mem, scratchpad, RandomX_CurrentConfig.ProgramIterations);
 	}

--- a/src/crypto/randomx/vm_compiled.hpp
+++ b/src/crypto/randomx/vm_compiled.hpp
@ -59,7 +59,7 @@ namespace randomx {
 	protected:
 		void execute();

-		JitCompiler compiler{ true };
+		JitCompiler compiler{ true, false };
 	};

 	using CompiledVmDefault = CompiledVm<1>;
--- a/src/crypto/randomx/vm_compiled_light.cpp
+++ b/src/crypto/randomx/vm_compiled_light.cpp
@ -1,5 +1,7 @@
 /*
-Copyright (c) 2018-2019, tevador <tevador@gmail.com>
+Copyright (c) 2018-2020, tevador    <tevador@gmail.com>
+Copyright (c) 2019-2020, SChernykh  <https://github.com/SChernykh>
+Copyright (c) 2019-2020, XMRig      <https://github.com/xmrig>, <support@xmrig.com>

 All rights reserved.

@ -36,6 +38,11 @@ namespace randomx {
 	void CompiledLightVm<softAes>::setCache(randomx_cache* cache) {
 		cachePtr = cache;
 		mem.memory = cache->memory;
+
+#		ifdef XMRIG_SECURE_JIT
+		compiler.enableWriting();
+#		endif
+
 		compiler.generateSuperscalarHash(cache->programs);
 	}

@ -43,7 +50,13 @@ namespace randomx {
 	void CompiledLightVm<softAes>::run(void* seed) {
 		VmBase<softAes>::generateProgram(seed);
 		randomx_vm::initialize();
+
+#		ifdef XMRIG_SECURE_JIT
+		compiler.enableWriting();
+#		endif
+
 		compiler.generateProgramLight(program, config, datasetOffset);
+
 		CompiledVm<softAes>::execute();
 	}