From 4b91978af63237d16b709583bc5c72bfa494e83c Mon Sep 17 00:00:00 2001 From: SChernykh Date: Sun, 21 Oct 2018 18:29:03 +0200 Subject: [PATCH 01/26] Added asm optimized code for AMD Bulldozer --- src/Summary.cpp | 3 +- src/common/xmrig.h | 1 + src/core/ConfigLoader_platform.h | 2 +- src/core/cpu/AdvancedCpuInfo.cpp | 2 +- src/crypto/Asm.cpp | 3 +- src/crypto/CryptoNight_x86.h | 6 +- src/crypto/asm/cnv2_main_loop.S | 9 + src/crypto/asm/cnv2_main_loop.asm | 7 + src/crypto/asm/cnv2_main_loop_bulldozer.inc | 180 ++++++++++++++++++++ src/workers/CpuThread.cpp | 5 +- 10 files changed, 211 insertions(+), 7 deletions(-) create mode 100644 src/crypto/asm/cnv2_main_loop_bulldozer.inc diff --git a/src/Summary.cpp b/src/Summary.cpp index 3c1d06a7..71c456a5 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -43,7 +43,8 @@ static const char *coloredAsmNames[] = { "\x1B[1;31mnone\x1B[0m", "auto", "\x1B[1;32mintel\x1B[0m", - "\x1B[1;32mryzen\x1B[0m" + "\x1B[1;32mryzen\x1B[0m", + "\x1B[1;32mbulldozer\x1B[0m" }; diff --git a/src/common/xmrig.h b/src/common/xmrig.h index 52650f0d..840a9148 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -99,6 +99,7 @@ enum Assembly { ASM_AUTO, ASM_INTEL, ASM_RYZEN, + ASM_BULLDOZER, ASM_MAX }; diff --git a/src/core/ConfigLoader_platform.h b/src/core/ConfigLoader_platform.h index 54546211..807d80e2 100644 --- a/src/core/ConfigLoader_platform.h +++ b/src/core/ConfigLoader_platform.h @@ -84,7 +84,7 @@ Options:\n\ "\ --max-cpu-usage=N maximum CPU usage for automatic threads mode (default 75)\n\ --safe safe adjust threads and av settings for current CPU\n\ - --asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen.\n\ + --asm=ASM ASM code for cn/2, possible values: auto, none, intel, ryzen, bulldozer.\n\ --print-time=N print hashrate report every N seconds\n\ --api-port=N port for the miner API\n\ --api-access-token=T access token for API\n\ diff --git a/src/core/cpu/AdvancedCpuInfo.cpp b/src/core/cpu/AdvancedCpuInfo.cpp index 
c1a9f8cd..06df98ad 100644 --- a/src/core/cpu/AdvancedCpuInfo.cpp +++ b/src/core/cpu/AdvancedCpuInfo.cpp @@ -76,7 +76,7 @@ xmrig::AdvancedCpuInfo::AdvancedCpuInfo() : m_aes = true; if (data.vendor == VENDOR_AMD) { - m_assembly = ASM_RYZEN; + m_assembly = (data.ext_family >= 23) ? ASM_RYZEN : ASM_BULLDOZER; } else if (data.vendor == VENDOR_INTEL) { m_assembly = ASM_INTEL; diff --git a/src/crypto/Asm.cpp b/src/crypto/Asm.cpp index 48c1beb8..9ef04bf9 100644 --- a/src/crypto/Asm.cpp +++ b/src/crypto/Asm.cpp @@ -40,7 +40,8 @@ static const char *asmNames[] = { "none", "auto", "intel", - "ryzen" + "ryzen", + "bulldozer" }; diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 8dcdd414..dfcd1296 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -564,6 +564,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si #ifndef XMRIG_NO_ASM extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); @@ -578,9 +579,12 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ if (ASM == xmrig::ASM_INTEL) { cnv2_mainloop_ivybridge_asm(ctx[0]); } - else { + else if (ASM == xmrig::ASM_RYZEN) { cnv2_mainloop_ryzen_asm(ctx[0]); } + else { + cnv2_mainloop_bulldozer_asm(ctx[0]); + } cn_implode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); xmrig::keccakf(reinterpret_cast(ctx[0]->state), 24); diff --git a/src/crypto/asm/cnv2_main_loop.S b/src/crypto/asm/cnv2_main_loop.S index 4dbcbbda..a23f24bf 100644 --- a/src/crypto/asm/cnv2_main_loop.S +++ b/src/crypto/asm/cnv2_main_loop.S @@ -9,6 +9,7 @@ #endif .global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global 
FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) ALIGN 16 @@ -27,6 +28,14 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): add rsp, 48 ret 0 +ALIGN 16 +FN_PREFIX(cnv2_mainloop_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): sub rsp, 48 diff --git a/src/crypto/asm/cnv2_main_loop.asm b/src/crypto/asm/cnv2_main_loop.asm index d9522267..557f1ab6 100644 --- a/src/crypto/asm/cnv2_main_loop.asm +++ b/src/crypto/asm/cnv2_main_loop.asm @@ -1,6 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm ALIGN 64 @@ -15,6 +16,12 @@ cnv2_mainloop_ryzen_asm PROC ret 0 cnv2_mainloop_ryzen_asm ENDP +ALIGN 64 +cnv2_mainloop_bulldozer_asm PROC + INCLUDE cnv2_main_loop_bulldozer.inc + ret 0 +cnv2_mainloop_bulldozer_asm ENDP + ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cnv2_double_main_loop_sandybridge.inc diff --git a/src/crypto/asm/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cnv2_main_loop_bulldozer.inc new file mode 100644 index 00000000..478976c0 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov 
rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_bulldozer + shr rdi, 19 + +sqrt_fixup_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + 
movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_bulldozer_endp + +sqrt_fixup_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_bulldozer_ret + +cnv2_main_loop_bulldozer_endp: diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 4b528148..aee26b0d 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -64,7 +64,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a assert(variant >= VARIANT_0 && variant < VARIANT_MAX); # ifndef XMRIG_NO_ASM - constexpr const size_t count = VARIANT_MAX * 10 * 3 + 3; + constexpr const size_t count = VARIANT_MAX * 10 * 3 + 4; # else constexpr const size_t count = VARIANT_MAX * 10 * 3; # endif @@ -249,6 +249,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a # ifndef XMRIG_NO_ASM cryptonight_single_hash_asm, cryptonight_single_hash_asm, + cryptonight_single_hash_asm, cryptonight_double_hash_asm # endif }; @@ -447,7 +448,7 @@ size_t xmrig::CpuThread::fnIndex(Algo algorithm, AlgoVariant av, Variant variant } if (av == AV_DOUBLE) { - return offset + 2; + return offset + 3; } } # endif From d7feb2719e4406aadced124a900d0cb2c8317f42 Mon Sep 17 00:00:00 
2001 From: XMRig Date: Sun, 21 Oct 2018 23:29:17 +0700 Subject: [PATCH 02/26] v2.8.5-dev --- src/version.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/version.h b/src/version.h index 96dedd54..4f929666 100644 --- a/src/version.h +++ b/src/version.h @@ -27,7 +27,7 @@ #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "XMRig CPU miner" -#define APP_VERSION "2.8.3" +#define APP_VERSION "2.8.5-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com" @@ -35,7 +35,7 @@ #define APP_VER_MAJOR 2 #define APP_VER_MINOR 8 -#define APP_VER_PATCH 3 +#define APP_VER_PATCH 5 #ifdef _MSC_VER # if (_MSC_VER >= 1910) From deb832c9c675d6920ea7b6d27d3e7b2412041199 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 21 Oct 2018 23:52:23 +0700 Subject: [PATCH 03/26] Restore old method for total threads count detection. --- src/core/cpu/AdvancedCpuInfo.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/core/cpu/AdvancedCpuInfo.cpp b/src/core/cpu/AdvancedCpuInfo.cpp index c1a9f8cd..1f738072 100644 --- a/src/core/cpu/AdvancedCpuInfo.cpp +++ b/src/core/cpu/AdvancedCpuInfo.cpp @@ -24,7 +24,6 @@ #include #include #include -#include #include "core/cpu/AdvancedCpuInfo.h" @@ -39,7 +38,7 @@ xmrig::AdvancedCpuInfo::AdvancedCpuInfo() : m_L2(0), m_L3(0), m_sockets(1), - m_threads(std::thread::hardware_concurrency()) + m_threads(0) { struct cpu_raw_data_t raw = { 0 }; struct cpu_id_t data = { 0 }; @@ -49,6 +48,7 @@ xmrig::AdvancedCpuInfo::AdvancedCpuInfo() : strncpy(m_brand, data.brand_str, sizeof(m_brand)); + m_threads = data.total_logical_cpus; m_sockets = threads() / data.num_logical_cpus; if (m_sockets == 0) { m_sockets = 1; From 06a84499d7e5100686c4757e68df3223bc2ed716 Mon Sep 17 00:00:00 2001 From: XMRig Date: Mon, 22 Oct 2018 23:08:29 +0700 Subject: [PATCH 04/26] Fixed MSYS2 build & copyright --- src/Summary.cpp | 1 + src/common/xmrig.h | 1 + 
src/core/cpu/AdvancedCpuInfo.cpp | 1 + src/crypto/Asm.cpp | 1 + src/crypto/asm/win64/cnv2_main_loop.S | 6 ++++++ src/crypto/asm/win64/cnv2_main_loop.asm | 7 +++++++ src/workers/CpuThread.cpp | 1 + 7 files changed, 18 insertions(+) diff --git a/src/Summary.cpp b/src/Summary.cpp index 71c456a5..f010d70b 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify diff --git a/src/common/xmrig.h b/src/common/xmrig.h index 840a9148..20306d1c 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify diff --git a/src/core/cpu/AdvancedCpuInfo.cpp b/src/core/cpu/AdvancedCpuInfo.cpp index 74ddd9e8..a9152136 100644 --- a/src/core/cpu/AdvancedCpuInfo.cpp +++ b/src/core/cpu/AdvancedCpuInfo.cpp @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify diff --git a/src/crypto/Asm.cpp b/src/crypto/Asm.cpp index 9ef04bf9..88812c6c 100644 --- a/src/crypto/Asm.cpp +++ b/src/crypto/Asm.cpp @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify diff --git a/src/crypto/asm/win64/cnv2_main_loop.S b/src/crypto/asm/win64/cnv2_main_loop.S index 78eb1185..1be27c64 100644 --- a/src/crypto/asm/win64/cnv2_main_loop.S +++ b/src/crypto/asm/win64/cnv2_main_loop.S @@ -3,6 +3,7 @@ 
.section .text .global cnv2_mainloop_ivybridge_asm .global cnv2_mainloop_ryzen_asm +.global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm ALIGN 16 @@ -15,6 +16,11 @@ cnv2_mainloop_ryzen_asm: #include "../cnv2_main_loop_ryzen.inc" ret 0 +ALIGN 16 +cnv2_mainloop_bulldozer_asm: + #include "../cnv2_main_loop_bulldozer.inc" + ret 0 + ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cnv2_double_main_loop_sandybridge.inc" diff --git a/src/crypto/asm/win64/cnv2_main_loop.asm b/src/crypto/asm/win64/cnv2_main_loop.asm index d9522267..557f1ab6 100644 --- a/src/crypto/asm/win64/cnv2_main_loop.asm +++ b/src/crypto/asm/win64/cnv2_main_loop.asm @@ -1,6 +1,7 @@ _TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm ALIGN 64 @@ -15,6 +16,12 @@ cnv2_mainloop_ryzen_asm PROC ret 0 cnv2_mainloop_ryzen_asm ENDP +ALIGN 64 +cnv2_mainloop_bulldozer_asm PROC + INCLUDE cnv2_main_loop_bulldozer.inc + ret 0 +cnv2_mainloop_bulldozer_asm ENDP + ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cnv2_double_main_loop_sandybridge.inc diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index aee26b0d..b6e91a65 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify From 735180ac04f896bd461c4e8962e077021e27c25e Mon Sep 17 00:00:00 2001 From: XMRig Date: Mon, 22 Oct 2018 23:17:54 +0700 Subject: [PATCH 05/26] Fixed MSVC 2015 build. 
--- .../asm/win64/cnv2_main_loop_bulldozer.inc | 180 ++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc diff --git a/src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc new file mode 100644 index 00000000..c19e9d69 --- /dev/null +++ b/src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + 
movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movd r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movd xmm0, rax + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_bulldozer + shr rdi, 19 + +sqrt_fixup_bulldozer_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_bulldozer_endp + +sqrt_fixup_bulldozer: + movd r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp 
sqrt_fixup_bulldozer_ret + +cnv2_main_loop_bulldozer_endp: From acd042c23454c2be617cbdf4d1b562b7c542a2c1 Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 24 Oct 2018 09:55:40 +0700 Subject: [PATCH 06/26] #839 Fixed FreeBSD compile. --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1becac5c..4ebfdbd3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -155,11 +155,11 @@ else() src/Mem_unix.cpp ) - set(EXTRA_LIBS pthread rt dl) -endif() - -if (CMAKE_SYSTEM_NAME STREQUAL FreeBSD) - set(EXTRA_LIBS ${EXTRA_LIBS} kvm) + if (CMAKE_SYSTEM_NAME STREQUAL FreeBSD) + set(EXTRA_LIBS kvm) + else() + set(EXTRA_LIBS pthread rt dl) + endif() endif() if (CMAKE_SYSTEM_NAME MATCHES "Linux") From 3d60b3cc624838d5359f4a85f25a19c5df3c3547 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 25 Oct 2018 14:07:15 +0700 Subject: [PATCH 07/26] #844 Fixed copy/paste typo in comment. --- src/common/config/CommonConfig.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/common/config/CommonConfig.cpp b/src/common/config/CommonConfig.cpp index beb2d0c9..01fdfa21 100644 --- a/src/common/config/CommonConfig.cpp +++ b/src/common/config/CommonConfig.cpp @@ -400,7 +400,7 @@ bool xmrig::CommonConfig::parseString(int key, const char *arg) case RetriesKey: /* --retries */ case RetryPauseKey: /* --retry-pause */ case ApiPort: /* --api-port */ - case PrintTimeKey: /* --cpu-priority */ + case PrintTimeKey: /* --print-time */ return parseUint64(key, strtol(arg, nullptr, 10)); case BackgroundKey: /* --background */ From 16babcc6bc9469d08dee4bfec8c99e4c9788223a Mon Sep 17 00:00:00 2001 From: WHR Date: Tue, 30 Oct 2018 18:41:59 +0800 Subject: [PATCH 08/26] Fix impossible to build for OS X without clang --- src/common/Platform_mac.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/common/Platform_mac.cpp b/src/common/Platform_mac.cpp index d0c533b0..c974d90d 100644 --- 
a/src/common/Platform_mac.cpp +++ b/src/common/Platform_mac.cpp @@ -46,9 +46,16 @@ char *Platform::createUserAgent() # ifdef XMRIG_NVIDIA_PROJECT const int cudaVersion = cuda_get_runtime_version(); - snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s CUDA/%d.%d clang/%d.%d.%d", APP_NAME, APP_VERSION, uv_version_string(), cudaVersion / 1000, cudaVersion % 100, __clang_major__, __clang_minor__, __clang_patchlevel__); + snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s CUDA/%d.%d", APP_NAME, APP_VERSION, uv_version_string(), cudaVersion / 1000, cudaVersion % 100); # else - snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s clang/%d.%d.%d", APP_NAME, APP_VERSION, uv_version_string(), __clang_major__, __clang_minor__, __clang_patchlevel__); + snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s", APP_NAME, APP_VERSION, uv_version_string()); +# endif +# ifdef __clang__ + size_t i = strlen(buf); + snprintf(buf + i, max - i, " clang/%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__); +# elif defined(__GNUC__) + size_t i = strlen(buf); + snprintf(buf + i, max - i, " gcc/%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); # endif return buf; From a76243a65ec4b38aa887de7ea4ebc0bd928dd417 Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 6 Nov 2018 00:50:28 +0700 Subject: [PATCH 09/26] Sync changes with proxy. 
--- CMakeLists.txt | 2 + src/base/tools/String.cpp | 225 +++++++++++++++++++++++++++++ src/base/tools/String.h | 103 +++++++++++++ src/common/config/CommonConfig.cpp | 51 +++++-- src/common/config/CommonConfig.h | 4 +- src/common/interfaces/IConfig.h | 27 ++-- src/common/log/Log.h | 8 +- src/common/net/Tls.h | 6 +- src/common/utils/c_str.h | 73 +--------- 9 files changed, 402 insertions(+), 97 deletions(-) create mode 100644 src/base/tools/String.cpp create mode 100644 src/base/tools/String.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 4ebfdbd3..4d910e6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,6 +17,7 @@ include (cmake/cpu.cmake) set(HEADERS src/api/NetworkState.h src/App.h + src/base/tools/String.h src/common/config/CommonConfig.h src/common/config/ConfigLoader.h src/common/config/ConfigWatcher.h @@ -93,6 +94,7 @@ endif() set(SOURCES src/api/NetworkState.cpp src/App.cpp + src/base/tools/String.cpp src/common/config/CommonConfig.cpp src/common/config/ConfigLoader.cpp src/common/config/ConfigWatcher.cpp diff --git a/src/base/tools/String.cpp b/src/base/tools/String.cpp new file mode 100644 index 00000000..fe2792c7 --- /dev/null +++ b/src/base/tools/String.cpp @@ -0,0 +1,225 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2016-2018 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + + +#include "base/tools/String.h" +#include "rapidjson/document.h" + + +xmrig::String::String(const char *str) : + m_data(nullptr), + m_size(str == nullptr ? 0 : strlen(str)) +{ + if (m_size == 0) { + return; + } + + m_data = new char[m_size + 1]; + memcpy(m_data, str, m_size + 1); +} + + +xmrig::String::String(const char *str, size_t size) : + m_data(nullptr), + m_size(size) +{ + if (str == nullptr) { + m_size = 0; + + return; + } + + m_data = new char[m_size + 1]; + memcpy(m_data, str, m_size); + m_data[m_size] = '\0'; +} + + +xmrig::String::String(const String &other) : + m_data(nullptr), + m_size(other.m_size) +{ + if (other.m_data == nullptr) { + return; + } + + m_data = new char[m_size + 1]; + memcpy(m_data, other.m_data, m_size + 1); +} + + +bool xmrig::String::isEqual(const char *str) const +{ + return (m_data != nullptr && str != nullptr && strcmp(m_data, str) == 0) || (m_data == nullptr && str == nullptr); +} + + +bool xmrig::String::isEqual(const String &other) const +{ + if (m_size != other.m_size) { + return false; + } + + return (m_data != nullptr && other.m_data != nullptr && memcmp(m_data, other.m_data, m_size) == 0) || (m_data == nullptr && other.m_data == nullptr); +} + + +rapidjson::Value xmrig::String::toJSON() const +{ + using namespace rapidjson; + + return isNull() ? Value(kNullType) : Value(StringRef(m_data)); +} + + +rapidjson::Value xmrig::String::toJSON(rapidjson::Document &doc) const +{ + using namespace rapidjson; + + return isNull() ? 
Value(kNullType) : Value(m_data, doc.GetAllocator());
+}
+
+/* Splits the string on `sep`; zero-length segments are skipped. */
+std::vector<xmrig::String> xmrig::String::split(char sep) const
+{
+    std::vector<xmrig::String> out;
+    if (m_size == 0) {
+        return out;
+    }
+
+    size_t start = 0;
+    size_t pos = 0;
+
+    for (pos = 0; pos < m_size; ++pos) {
+        if (m_data[pos] == sep) {
+            if ((pos - start) > 0) {
+                out.push_back(std::move(String(m_data + start, pos - start)));
+            }
+
+            start = pos + 1;
+        }
+    }
+
+    if ((pos - start) > 0) {
+        out.push_back(std::move(String(m_data + start, pos - start)));
+    }
+
+    return out;
+}
+
+/* Joins `vec` with `sep` between elements; an empty vector yields a default-constructed String. */
+xmrig::String xmrig::String::join(const std::vector<xmrig::String> &vec, char sep)
+{
+    if (vec.empty()) {
+        return String();
+    }
+
+    size_t size = vec.size(); /* one extra byte per element: a separator, or the terminator after the last one */
+    for (const String &str : vec) {
+        size += str.size();
+    }
+
+    size_t offset = 0;
+    char *buf = new char[size];
+
+    for (const String &str : vec) {
+        memcpy(buf + offset, str.data(), str.size());
+
+        offset += str.size() + 1;
+
+        if (offset < size) {
+            buf[offset - 1] = sep;
+        }
+    }
+
+    buf[size - 1] = '\0';
+
+    return String(buf); /* buf is a char *, so the String(char *) constructor adopts it without copying */
+}
+
+/* Replaces the contents with a copy of the C string `str`; nullptr resets to the null state. */
+void xmrig::String::copy(const char *str)
+{
+    delete [] m_data;
+
+    if (str == nullptr) {
+        m_size = 0;
+        m_data = nullptr;
+
+        return;
+    }
+
+    m_size = strlen(str);
+    m_data = new char[m_size + 1];
+
+    memcpy(m_data, str, m_size + 1);
+}
+
+/* Replaces the contents with a copy of `other` (null stays null). */
+void xmrig::String::copy(const String &other)
+{
+    /* Self-assignment must not release the buffer that is about to be read. */
+    if (this == &other) {
+        return;
+    }
+
+    /* Same non-zero length: reuse the existing buffer instead of reallocating. */
+    if (m_size > 0 && m_size == other.m_size) {
+        memcpy(m_data, other.m_data, m_size + 1);
+        return;
+    }
+
+    /* Release the old buffer exactly once before replacing it. */
+    delete [] m_data;
+
+    if (other.m_data == nullptr) {
+        m_size = 0;
+        m_data = nullptr;
+        return;
+    }
+
+    m_size = other.m_size;
+    m_data = new char[m_size + 1];
+    memcpy(m_data, other.m_data, m_size + 1);
+}
+
+/* Adopts `str` as the new buffer without copying; the previous buffer is released. */
+void xmrig::String::move(char *str)
+{
+    delete [] m_data;
+
+    m_size = str == nullptr ? 
0 : strlen(str);
+    m_data = str;
+}
+
+/* Steals the buffer of `other`, leaving `other` null and empty. */
+void xmrig::String::move(String &&other)
+{
+    delete [] m_data;
+
+    m_data = other.m_data;
+    m_size = other.m_size;
+
+    other.m_data = nullptr;
+    other.m_size = 0;
+}
diff --git a/src/base/tools/String.h b/src/base/tools/String.h
new file mode 100644
index 00000000..b2da0940
--- /dev/null
+++ b/src/base/tools/String.h
@@ -0,0 +1,103 @@
+/* XMRig
+ * Copyright 2010 Jeff Garzik 
+ * Copyright 2012-2014 pooler 
+ * Copyright 2014 Lucas Jones 
+ * Copyright 2014-2016 Wolf9466 
+ * Copyright 2016 Jay D Dee 
+ * Copyright 2017-2018 XMR-Stak , 
+ * Copyright 2016-2018 XMRig , 
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef XMRIG_STRING_H
+#define XMRIG_STRING_H
+
+
+#include
+#include
+
+
+#include "rapidjson/fwd.h"
+
+
+namespace xmrig {
+
+
+/**
+ * @brief Simple C string wrapper.
+ *
+ * 1. I know about std::string.
+ * 2. I prefer not to use std::string in the miner, e.g. because of the file size of MSYS2 builds.
+ * 3. nullptr and JSON conversion are supported.
+ */
+class String
+{
+public:
+    inline String() : m_data(nullptr), m_size(0) {}
+    inline String(char *str) : m_data(str), m_size(str == nullptr ? 0 : strlen(str)) {} /* adopts str without copying; the destructor delete []s it */
+    inline String(String &&other) noexcept : m_data(other.m_data), m_size(other.m_size) { other.m_data = nullptr; other.m_size = 0; }
+
+    String(const char *str);
+    String(const char *str, size_t size);
+    String(const String &other);
+
+    inline ~String() { delete [] m_data; }
+
+
+    bool isEqual(const char *str) const;
+    bool isEqual(const String &other) const;
+
+    /* Substring test; a null String contains nothing (strstr must not be given a null haystack). */
+    inline bool contains(const char *str) const { return m_data != nullptr && strstr(m_data, str) != nullptr; }
+
+
+    inline bool isEmpty() const { return size() == 0; }
+    inline bool isNull() const { return m_data == nullptr; }
+    inline char *data() { return m_data; }
+    inline const char *data() const { return m_data; }
+    inline size_t size() const { return m_size; }
+
+    /* Assignment from char * adopts the pointer (move); from const char * or const String & it copies. */
+    inline bool operator!=(const char *str) const { return !isEqual(str); }
+    inline bool operator!=(const String &other) const { return !isEqual(other); }
+    inline bool operator<(const String &str) const { return strcmp(data(), str.data()) < 0; } /* NOTE(review): data() goes straight to strcmp, so neither side may be null */
+    inline bool operator==(const char *str) const { return isEqual(str); }
+    inline bool operator==(const String &other) const { return isEqual(other); }
+    inline String &operator=(char *str) { move(str); return *this; }
+    inline String &operator=(const char *str) { copy(str); return *this; }
+    inline String &operator=(const String &str) { copy(str); return *this; }
+    inline String &operator=(String &&other) noexcept { move(std::move(other)); return *this; }
+
+    rapidjson::Value toJSON() const;
+    rapidjson::Value toJSON(rapidjson::Document &doc) const;
+    std::vector<xmrig::String> split(char sep) const;
+
+    static String join(const std::vector<xmrig::String> &vec, char sep);
+
+private:
+    void copy(const char *str);
+    void copy(const String &other);
+    void move(char *str);
+    void move(String &&other);
+
+    char *m_data;
+    size_t m_size;
+};
+
+
+} /* namespace xmrig */
+
+
+#endif /* XMRIG_STRING_H */
diff --git a/src/common/config/CommonConfig.cpp b/src/common/config/CommonConfig.cpp
index 01fdfa21..94399d7d 100644
--- 
a/src/common/config/CommonConfig.cpp +++ b/src/common/config/CommonConfig.cpp @@ -280,16 +280,16 @@ bool xmrig::CommonConfig::parseBoolean(int key, bool enable) break; case KeepAliveKey: /* --keepalive */ - m_pools.back().setKeepAlive(enable ? Pool::kKeepAliveTimeout : 0); + currentPool().setKeepAlive(enable ? Pool::kKeepAliveTimeout : 0); break; case TlsKey: /* --tls */ - m_pools.back().setTLS(enable); + currentPool().setTLS(enable); break; # ifndef XMRIG_PROXY_PROJECT case NicehashKey: /* --nicehash */ - m_pools.back().setNicehash(enable); + currentPool().setNicehash(enable); break; # endif @@ -333,13 +333,15 @@ bool xmrig::CommonConfig::parseString(int key, const char *arg) break; case UserpassKey: /* --userpass */ - if (!m_pools.back().setUserpass(arg)) { + if (!currentPool().setUserpass(arg)) { return false; } break; case UrlKey: /* --url */ + fixup(); + if (m_pools.size() > 1 || m_pools[0].isValid()) { Pool pool(arg); @@ -358,23 +360,23 @@ bool xmrig::CommonConfig::parseString(int key, const char *arg) break; case UserKey: /* --user */ - m_pools.back().setUser(arg); + currentPool().setUser(arg); break; case PasswordKey: /* --pass */ - m_pools.back().setPassword(arg); + currentPool().setPassword(arg); break; case RigIdKey: /* --rig-id */ - m_pools.back().setRigId(arg); + currentPool().setRigId(arg); break; case FingerprintKey: /* --tls-fingerprint */ - m_pools.back().setFingerprint(arg); + currentPool().setFingerprint(arg); break; case VariantKey: /* --variant */ - m_pools.back().algorithm().parseVariant(arg); + currentPool().algorithm().parseVariant(arg); break; case LogFileKey: /* --log-file */ @@ -462,11 +464,11 @@ bool xmrig::CommonConfig::parseInt(int key, int arg) break; case KeepAliveKey: /* --keepalive */ - m_pools.back().setKeepAlive(arg); + currentPool().setKeepAlive(arg); break; case VariantKey: /* --variant */ - m_pools.back().algorithm().parseVariant(arg); + currentPool().algorithm().parseVariant(arg); break; case DonateLevelKey: /* --donate-level 
*/ @@ -493,3 +495,30 @@ bool xmrig::CommonConfig::parseInt(int key, int arg) return true; } + + +Pool &xmrig::CommonConfig::currentPool() +{ + fixup(); + + return m_pools.back(); +} + + +void xmrig::CommonConfig::fixup() +{ + if (m_state == NoneState) { + return; + } + + if (m_pools.empty()) { + if (!m_activePools.empty()) { + std::swap(m_pools, m_activePools); + } + else { + m_pools.push_back(Pool()); + } + + m_state = NoneState; + } +} diff --git a/src/common/config/CommonConfig.h b/src/common/config/CommonConfig.h index 422a6bb2..a864033b 100644 --- a/src/common/config/CommonConfig.h +++ b/src/common/config/CommonConfig.h @@ -112,9 +112,11 @@ protected: private: bool parseInt(int key, int arg); + Pool &currentPool(); + void fixup(); }; } /* namespace xmrig */ -#endif /* __COMMONCONFIG_H__ */ +#endif /* XMRIG_COMMONCONFIG_H */ diff --git a/src/common/interfaces/IConfig.h b/src/common/interfaces/IConfig.h index 69f2ffab..0c8cfc28 100644 --- a/src/common/interfaces/IConfig.h +++ b/src/common/interfaces/IConfig.h @@ -97,16 +97,23 @@ public: OclCompModeKey = 1410, // xmrig-proxy - AccessLogFileKey = 'A', - BindKey = 'b', - CoinKey = 1104, - CustomDiffKey = 1102, - DebugKey = 1101, - ModeKey = 'm', - PoolCoinKey = 'C', - ReuseTimeoutKey = 1106, - WorkersKey = 1103, - WorkersAdvKey = 1107, + AccessLogFileKey = 'A', + BindKey = 'b', + CoinKey = 1104, + CustomDiffKey = 1102, + DebugKey = 1101, + ModeKey = 'm', + PoolCoinKey = 'C', + ReuseTimeoutKey = 1106, + WorkersKey = 1103, + WorkersAdvKey = 1107, + TlsBindKey = 1108, + TlsCertKey = 1109, + TlsCertKeyKey = 1110, + TlsDHparamKey = 1111, + TlsCiphersKey = 1112, + TlsCipherSuitesKey = 1113, + TlsProtocolsKey = 1114, // xmrig nvidia CudaMaxThreadsKey = 1200, diff --git a/src/common/log/Log.h b/src/common/log/Log.h index 2774ae0c..788ad263 100644 --- a/src/common/log/Log.h +++ b/src/common/log/Log.h @@ -21,8 +21,8 @@ * along with this program. If not, see . 
*/ -#ifndef __LOG_H__ -#define __LOG_H__ +#ifndef XMRIG_LOG_H +#define XMRIG_LOG_H #include @@ -39,7 +39,7 @@ public: static inline Log* i() { if (!m_self) { defaultInit(); } return m_self; } static inline void add(ILogBackend *backend) { i()->m_backends.push_back(backend); } static inline void init() { if (!m_self) { new Log(); } } - static inline void release() { assert(m_self != nullptr); delete m_self; } + static inline void release() { delete m_self; } void message(ILogBackend::Level level, const char* fmt, ...); void text(const char* fmt, ...); @@ -98,4 +98,4 @@ private: # define LOG_DEBUG_WARN(x, ...) #endif -#endif /* __LOG_H__ */ +#endif /* XMRIG_LOG_H */ diff --git a/src/common/net/Tls.h b/src/common/net/Tls.h index 6e38f32f..083adfc4 100644 --- a/src/common/net/Tls.h +++ b/src/common/net/Tls.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef XMRIG_TLS_H -#define XMRIG_TLS_H +#ifndef XMRIG_CLIENT_TLS_H +#define XMRIG_CLIENT_TLS_H #include @@ -59,4 +59,4 @@ private: }; -#endif /* XMRIG_TLS_H */ +#endif /* XMRIG_CLIENT_TLS_H */ diff --git a/src/common/utils/c_str.h b/src/common/utils/c_str.h index 7ce63754..fe0164b9 100644 --- a/src/common/utils/c_str.h +++ b/src/common/utils/c_str.h @@ -21,82 +21,19 @@ * along with this program. If not, see . */ -#ifndef __C_STR_H__ -#define __C_STR_H__ +#ifndef XMRIG_C_STR_H +#define XMRIG_C_STR_H -#include -#include - -#include +#include "base/tools/String.h" namespace xmrig { -/** - * @brief Simple C string wrapper. - * - * 1. I know about std:string. - * 2. For some reason I prefer don't use std:string in miner, eg because of file size of MSYS2 builds. 
- */ -class c_str -{ -public: - inline c_str() : m_data(nullptr) {} - inline c_str(c_str &&other) { m_data = other.m_data; other.m_data = nullptr; } - inline c_str(const c_str &other) : m_data(nullptr) { set(other.data()); } - inline c_str(const char *str) : m_data(nullptr) { set(str); } - inline ~c_str() { free(m_data); } - - - inline void set(const char *str) - { - free(m_data); - - m_data = str != nullptr ? strdup(str) : nullptr; - } - - - inline void set(char *str) - { - free(m_data); - - m_data = str; - } - - - inline bool isEqual(const char *str) const - { - return (m_data != nullptr && str != nullptr && strcmp(m_data, str) == 0) || (m_data == nullptr && m_data == nullptr); - } - - - inline bool contains(const char *str) const - { - return strstr(m_data, str) != nullptr; - } - - - inline bool isNull() const { return m_data == nullptr; } - inline const char *data() const { return m_data; } - inline size_t size() const { return m_data == nullptr ? 0 : strlen(m_data); } - - - inline bool operator!=(const c_str &str) const { return !isEqual(str.data()); } - inline bool operator!=(const char *str) const { return !isEqual(str); } - inline bool operator==(const c_str &str) const { return isEqual(str.data()); } - inline bool operator==(const char *str) const { return isEqual(str); } - inline c_str &operator=(char *str) { set(str); return *this; } - inline c_str &operator=(const c_str &str) { set(str.data()); return *this; } - inline c_str &operator=(const char *str) { set(str); return *this; } - - -private: - char *m_data; -}; +typedef String c_str; } /* namespace xmrig */ -#endif /* __C_STR_H__ */ +#endif /* XMRIG_C_STR_H */ From 9f6f599d785fb4138e6a4d10e0c2c790dde36eb7 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 11 Nov 2018 03:18:56 +0700 Subject: [PATCH 10/26] Sync changes. 
--- CMakeLists.txt | 4 ++-- cmake/OpenSSL.cmake | 2 ++ src/App.cpp | 4 ++++ src/common/config/CommonConfig.cpp | 3 +++ src/common/config/ConfigLoader.cpp | 5 +++++ src/common/config/ConfigLoader.h | 9 ++++++--- src/core/Controller.cpp | 6 ++++++ src/core/Controller.h | 7 ++++--- 8 files changed, 32 insertions(+), 8 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 4d910e6e..039bf418 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -257,5 +257,5 @@ if (WITH_DEBUG_LOG) add_definitions(/DAPP_DEBUG) endif() -add_executable(${PROJECT_NAME} ${HEADERS} ${SOURCES} ${SOURCES_OS} ${SOURCES_CPUID} ${HEADERS_CRYPTO} ${SOURCES_CRYPTO} ${SOURCES_SYSLOG} ${HTTPD_SOURCES} ${TLS_SOURCES} ${XMRIG_ASM_SOURCES}) -target_link_libraries(${PROJECT_NAME} ${XMRIG_ASM_LIBRARY} ${OPENSSL_LIBRARIES} ${UV_LIBRARIES} ${MHD_LIBRARY} ${EXTRA_LIBS} ${CPUID_LIB}) +add_executable(${CMAKE_PROJECT_NAME} ${HEADERS} ${SOURCES} ${SOURCES_OS} ${SOURCES_CPUID} ${HEADERS_CRYPTO} ${SOURCES_CRYPTO} ${SOURCES_SYSLOG} ${HTTPD_SOURCES} ${TLS_SOURCES} ${XMRIG_ASM_SOURCES}) +target_link_libraries(${CMAKE_PROJECT_NAME} ${XMRIG_ASM_LIBRARY} ${OPENSSL_LIBRARIES} ${UV_LIBRARIES} ${MHD_LIBRARY} ${EXTRA_LIBS} ${CPUID_LIB}) diff --git a/cmake/OpenSSL.cmake b/cmake/OpenSSL.cmake index ed287e7e..79731d6f 100644 --- a/cmake/OpenSSL.cmake +++ b/cmake/OpenSSL.cmake @@ -20,4 +20,6 @@ else() set(TLS_SOURCES "") set(OPENSSL_LIBRARIES "") add_definitions(/DXMRIG_NO_TLS) + + set(CMAKE_PROJECT_NAME "${CMAKE_PROJECT_NAME}-notls") endif() diff --git a/src/App.cpp b/src/App.cpp index 134e4ef5..e3c4c222 100644 --- a/src/App.cpp +++ b/src/App.cpp @@ -87,6 +87,10 @@ App::~App() int App::exec() { + if (m_controller->isDone()) { + return 0; + } + if (!m_controller->isReady()) { return 2; } diff --git a/src/common/config/CommonConfig.cpp b/src/common/config/CommonConfig.cpp index 94399d7d..f2a00182 100644 --- a/src/common/config/CommonConfig.cpp +++ b/src/common/config/CommonConfig.cpp @@ -174,8 +174,11 @@ void 
xmrig::CommonConfig::printVersions() int length = snprintf(buf, sizeof buf, "CUDA/%d.%d ", cudaVersion / 1000, cudaVersion % 100); # else memset(buf, 0, 16); + +# if !defined(XMRIG_NO_HTTPD) || !defined(XMRIG_NO_TLS) int length = 0; # endif +# endif # if !defined(XMRIG_NO_TLS) && defined(OPENSSL_VERSION_TEXT) { diff --git a/src/common/config/ConfigLoader.cpp b/src/common/config/ConfigLoader.cpp index 484c2f8f..0365e151 100644 --- a/src/common/config/ConfigLoader.cpp +++ b/src/common/config/ConfigLoader.cpp @@ -50,6 +50,7 @@ #include "rapidjson/filereadstream.h" +bool xmrig::ConfigLoader::m_done = false; xmrig::ConfigWatcher *xmrig::ConfigLoader::m_watcher = nullptr; xmrig::IConfigCreator *xmrig::ConfigLoader::m_creator = nullptr; xmrig::IWatcherListener *xmrig::ConfigLoader::m_listener = nullptr; @@ -283,12 +284,16 @@ void xmrig::ConfigLoader::parseJSON(xmrig::IConfig *config, const struct option void xmrig::ConfigLoader::showUsage() { + m_done = true; + printf(usage); } void xmrig::ConfigLoader::showVersion() { + m_done = true; + printf(APP_NAME " " APP_VERSION "\n built on " __DATE__ # if defined(__clang__) diff --git a/src/common/config/ConfigLoader.h b/src/common/config/ConfigLoader.h index 64638af3..840080f9 100644 --- a/src/common/config/ConfigLoader.h +++ b/src/common/config/ConfigLoader.h @@ -21,8 +21,8 @@ * along with this program. If not, see . 
*/ -#ifndef __CONFIGLOADER_H__ -#define __CONFIGLOADER_H__ +#ifndef XMRIG_CONFIGLOADER_H +#define XMRIG_CONFIGLOADER_H #include @@ -53,6 +53,8 @@ public: static IConfig *load(int argc, char **argv, IConfigCreator *creator, IWatcherListener *listener); static void release(); + static inline bool isDone() { return m_done; } + private: static bool getJSON(const char *fileName, rapidjson::Document &doc); static bool parseArg(IConfig *config, int key, const char *arg); @@ -60,6 +62,7 @@ private: static void showUsage(); static void showVersion(); + static bool m_done; static ConfigWatcher *m_watcher; static IConfigCreator *m_creator; static IWatcherListener *m_listener; @@ -68,4 +71,4 @@ private: } /* namespace xmrig */ -#endif /* __CONFIGLOADER_H__ */ +#endif /* XMRIG_CONFIGLOADER_H */ diff --git a/src/core/Controller.cpp b/src/core/Controller.cpp index 792ac939..7a9b8284 100644 --- a/src/core/Controller.cpp +++ b/src/core/Controller.cpp @@ -78,6 +78,12 @@ xmrig::Controller::~Controller() } +bool xmrig::Controller::isDone() const +{ + return ConfigLoader::isDone(); +} + + bool xmrig::Controller::isReady() const { return d_ptr->config && d_ptr->network; diff --git a/src/core/Controller.h b/src/core/Controller.h index 2c66af53..abb11ecf 100644 --- a/src/core/Controller.h +++ b/src/core/Controller.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef __CONTROLLER_H__ -#define __CONTROLLER_H__ +#ifndef XMRIG_CONTROLLER_H +#define XMRIG_CONTROLLER_H #include "common/interfaces/IWatcherListener.h" @@ -46,6 +46,7 @@ public: Controller(); ~Controller(); + bool isDone() const; bool isReady() const; Config *config() const; int init(int argc, char **argv); @@ -61,4 +62,4 @@ private: } /* namespace xmrig */ -#endif /* __CONTROLLER_H__ */ +#endif /* XMRIG_CONTROLLER_H */ From 2a2712ab90e10487cf7b506f7f34db5f8f3a0f5c Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 20 Nov 2018 07:24:14 +0700 Subject: [PATCH 11/26] Sync changes. 
--- src/common/Platform.cpp | 7 ++++--- src/common/Platform.h | 11 +++++++---- src/common/Platform_mac.cpp | 12 +++++++++++- src/common/Platform_unix.cpp | 11 +++++++++++ src/common/Platform_win.cpp | 38 ++++++++++++++++++++++++++++++++++-- src/common/net/Client.cpp | 6 ++++++ src/common/net/Id.h | 6 +++--- src/common/net/Job.cpp | 18 ++++++----------- src/common/net/Job.h | 5 +++-- src/common/utils/timestamp.h | 10 +++++++++- src/net/JobResult.h | 21 ++++++++++++++++---- src/workers/MultiWorker.cpp | 2 +- 12 files changed, 114 insertions(+), 33 deletions(-) diff --git a/src/common/Platform.cpp b/src/common/Platform.cpp index a95f78e7..17fcc38e 100644 --- a/src/common/Platform.cpp +++ b/src/common/Platform.cpp @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh + * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -36,7 +37,7 @@ char Platform::m_defaultConfigName[520] = { 0 }; -xmrig::c_str Platform::m_userAgent; +xmrig::String Platform::m_userAgent; const char *Platform::defaultConfigName() diff --git a/src/common/Platform.h b/src/common/Platform.h index 5dfb9ff7..fc10e83b 100644 --- a/src/common/Platform.h +++ b/src/common/Platform.h @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh + * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -28,7 +29,7 @@ #include -#include "common/utils/c_str.h" +#include "base/tools/String.h" class Platform @@ -36,7 +37,9 @@ class Platform public: static bool setThreadAffinity(uint64_t 
cpu_id); static const char *defaultConfigName(); + static uint32_t setTimerResolution(uint32_t resolution); static void init(const char *userAgent); + static void restoreTimerResolution(); static void setProcessPriority(int priority); static void setThreadPriority(int priority); @@ -46,7 +49,7 @@ private: static char *createUserAgent(); static char m_defaultConfigName[520]; - static xmrig::c_str m_userAgent; + static xmrig::String m_userAgent; }; diff --git a/src/common/Platform_mac.cpp b/src/common/Platform_mac.cpp index d0c533b0..b587c01f 100644 --- a/src/common/Platform_mac.cpp +++ b/src/common/Platform_mac.cpp @@ -65,9 +65,19 @@ bool Platform::setThreadAffinity(uint64_t cpu_id) } +uint32_t Platform::setTimerResolution(uint32_t resolution) +{ + return resolution; +} + + +void Platform::restoreTimerResolution() +{ +} + + void Platform::setProcessPriority(int priority) { - } diff --git a/src/common/Platform_unix.cpp b/src/common/Platform_unix.cpp index 058920ec..1263b846 100644 --- a/src/common/Platform_unix.cpp +++ b/src/common/Platform_unix.cpp @@ -92,6 +92,17 @@ bool Platform::setThreadAffinity(uint64_t cpu_id) } +uint32_t Platform::setTimerResolution(uint32_t resolution) +{ + return resolution; +} + + +void Platform::restoreTimerResolution() +{ +} + + void Platform::setProcessPriority(int priority) { } diff --git a/src/common/Platform_win.cpp b/src/common/Platform_win.cpp index 32b850d1..d220f58d 100644 --- a/src/common/Platform_win.cpp +++ b/src/common/Platform_win.cpp @@ -4,8 +4,9 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 SChernykh + * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,6 +23,7 @@ */ +#include #include #include #include @@ -37,6 +39,9 @@ #endif +static uint32_t timerResolution = 0; + 
+ + static inline OSVERSIONINFOEX winOsVersion() { typedef NTSTATUS (NTAPI *RtlGetVersionFunction)(LPOSVERSIONINFO); @@ -94,6 +99,34 @@ bool Platform::setThreadAffinity(uint64_t cpu_id) } +uint32_t Platform::setTimerResolution(uint32_t resolution) +{ +# ifdef XMRIG_AMD_PROJECT + TIMECAPS tc; + + if (timeGetDevCaps(&tc, sizeof(TIMECAPS)) != TIMERR_NOERROR) { + return 0; + } + + timerResolution = std::min(std::max(tc.wPeriodMin, resolution), tc.wPeriodMax); + + return timeBeginPeriod(timerResolution) == TIMERR_NOERROR ? timerResolution : 0; +# else + return resolution; +# endif +} + + +void Platform::restoreTimerResolution() +{ +# ifdef XMRIG_AMD_PROJECT + if (timerResolution) { + timeEndPeriod(timerResolution); + } +# endif +} + + void Platform::setProcessPriority(int priority) { if (priority == -1) { @@ -121,6 +154,7 @@ void Platform::setProcessPriority(int priority) case 5: prio = REALTIME_PRIORITY_CLASS; + break; default: break; diff --git a/src/common/net/Client.cpp b/src/common/net/Client.cpp index 1d1d86c7..c0ab50b1 100644 --- a/src/common/net/Client.cpp +++ b/src/common/net/Client.cpp @@ -212,6 +212,10 @@ const char *Client::tlsVersion() const int64_t Client::submit(const JobResult &result) { + if (result.clientId != m_rpcId) { + return -1; + } + using namespace rapidjson; # ifdef XMRIG_PROXY_PROJECT @@ -355,6 +359,8 @@ bool Client::parseJob(const rapidjson::Value &params, int *code) return false; } + m_job.setClientId(m_rpcId); + if (m_job != job) { m_jobs++; m_job = std::move(job); diff --git a/src/common/net/Id.h b/src/common/net/Id.h index 5fb2db52..4b13e793 100644 --- a/src/common/net/Id.h +++ b/src/common/net/Id.h @@ -21,8 +21,8 @@ * along with this program. If not, see . 
*/ -#ifndef __ID_H__ -#define __ID_H__ +#ifndef XMRIG_ID_H +#define XMRIG_ID_H #include @@ -95,4 +95,4 @@ private: } /* namespace xmrig */ -#endif /* __ID_H__ */ +#endif /* XMRIG_ID_H */ diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index 2bfb39f0..6d3ee993 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -91,6 +91,12 @@ Job::~Job() } +bool Job::isEqual(const Job &other) const +{ + return m_id == other.m_id && m_clientId == other.m_clientId && memcmp(m_blob, other.m_blob, sizeof(m_blob)) == 0; +} + + bool Job::setBlob(const char *blob) { if (!blob) { @@ -214,18 +220,6 @@ char *Job::toHex(const unsigned char* in, unsigned int len) #endif -bool Job::operator==(const Job &other) const -{ - return m_id == other.m_id && memcmp(m_blob, other.m_blob, sizeof(m_blob)) == 0; -} - - -bool Job::operator!=(const Job &other) const -{ - return m_id != other.m_id || memcmp(m_blob, other.m_blob, sizeof(m_blob)) != 0; -} - - xmrig::Variant Job::variant() const { using namespace xmrig; diff --git a/src/common/net/Job.h b/src/common/net/Job.h index b561b9c1..51e3428e 100644 --- a/src/common/net/Job.h +++ b/src/common/net/Job.h @@ -42,6 +42,7 @@ public: Job(int poolId, bool nicehash, const xmrig::Algorithm &algorithm, const xmrig::Id &clientId); ~Job(); + bool isEqual(const Job &other) const; bool setBlob(const char *blob); bool setTarget(const char *target); void setAlgorithm(const char *algo); @@ -81,8 +82,8 @@ public: static char *toHex(const unsigned char* in, unsigned int len); # endif - bool operator==(const Job &other) const; - bool operator!=(const Job &other) const; + inline bool operator==(const Job &other) const { return isEqual(other); } + inline bool operator!=(const Job &other) const { return !isEqual(other); } private: xmrig::Variant variant() const; diff --git a/src/common/utils/timestamp.h b/src/common/utils/timestamp.h index 6b6a8ab2..7fc4ab50 100644 --- a/src/common/utils/timestamp.h +++ b/src/common/utils/timestamp.h @@ -31,7 +31,7 
@@ namespace xmrig { -static inline int64_t currentMSecsSinceEpoch() +static inline int64_t steadyTimestamp() { using namespace std::chrono; if (high_resolution_clock::is_steady) { @@ -42,6 +42,14 @@ static inline int64_t currentMSecsSinceEpoch() } +static inline int64_t currentMSecsSinceEpoch() +{ + using namespace std::chrono; + + return time_point_cast(high_resolution_clock::now()).time_since_epoch().count(); +} + + } /* namespace xmrig */ #endif /* XMRIG_TIMESTAMP_H */ diff --git a/src/net/JobResult.h b/src/net/JobResult.h index 4a920ca0..071cbb8b 100644 --- a/src/net/JobResult.h +++ b/src/net/JobResult.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef __JOBRESULT_H__ -#define __JOBRESULT_H__ +#ifndef XMRIG_JOBRESULT_H +#define XMRIG_JOBRESULT_H #include @@ -36,17 +36,29 @@ class JobResult { public: inline JobResult() : poolId(0), diff(0), nonce(0) {} - inline JobResult(int poolId, const xmrig::Id &jobId, uint32_t nonce, const uint8_t *result, uint32_t diff, const xmrig::Algorithm &algorithm) : + inline JobResult(int poolId, const xmrig::Id &jobId, const xmrig::Id &clientId, uint32_t nonce, const uint8_t *result, uint32_t diff, const xmrig::Algorithm &algorithm) : poolId(poolId), diff(diff), nonce(nonce), algorithm(algorithm), + clientId(clientId), jobId(jobId) { memcpy(this->result, result, sizeof(this->result)); } + inline JobResult(const Job &job) : poolId(0), diff(0), nonce(0) + { + jobId = job.id(); + clientId = job.clientId(); + poolId = job.poolId(); + diff = job.diff(); + nonce = *job.nonce(); + algorithm = job.algorithm(); + } + + inline uint64_t actualDiff() const { return Job::toDiff(reinterpret_cast(result)[3]); @@ -58,7 +70,8 @@ public: uint32_t nonce; uint8_t result[32]; xmrig::Algorithm algorithm; + xmrig::Id clientId; xmrig::Id jobId; }; -#endif /* __JOBRESULT_H__ */ +#endif /* XMRIG_JOBRESULT_H */ diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index a6dbc73a..645259b6 100644 --- 
a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -108,7 +108,7 @@ void MultiWorker::start() for (size_t i = 0; i < N; ++i) { if (*reinterpret_cast(m_hash + (i * 32) + 24) < m_state.job.target()) { - Workers::submit(JobResult(m_state.job.poolId(), m_state.job.id(), *nonce(i), m_hash + (i * 32), m_state.job.diff(), m_state.job.algorithm())); + Workers::submit(JobResult(m_state.job.poolId(), m_state.job.id(), m_state.job.clientId(), *nonce(i), m_hash + (i * 32), m_state.job.diff(), m_state.job.algorithm())); } *nonce(i) += 1; From c06f77b9e9233c70b0614e7bfbdcb6404d700c82 Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 20 Nov 2018 08:18:39 +0700 Subject: [PATCH 12/26] Better compiler name and version handling on Linux and macOS for user-agent string. --- src/common/Platform_mac.cpp | 16 +++++++--------- src/common/Platform_unix.cpp | 8 +++++--- src/common/Platform_win.cpp | 4 ++-- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/common/Platform_mac.cpp b/src/common/Platform_mac.cpp index 3855a6e3..4e4aa0ad 100644 --- a/src/common/Platform_mac.cpp +++ b/src/common/Platform_mac.cpp @@ -40,22 +40,20 @@ char *Platform::createUserAgent() { - const size_t max = 160; + constexpr const size_t max = 256; - char *buf = new char[max]; + char *buf = new char[max](); + int length = snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s", APP_NAME, APP_VERSION, uv_version_string()); # ifdef XMRIG_NVIDIA_PROJECT const int cudaVersion = cuda_get_runtime_version(); - snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s CUDA/%d.%d", APP_NAME, APP_VERSION, uv_version_string(), cudaVersion / 1000, cudaVersion % 100); -# else - snprintf(buf, max, "%s/%s (Macintosh; Intel Mac OS X) libuv/%s", APP_NAME, APP_VERSION, uv_version_string()); + length += snprintf(buf + length, max - length, " CUDA/%d.%d", cudaVersion / 1000, cudaVersion % 100); # endif + # ifdef __clang__ - size_t i = strlen(buf); - snprintf(buf + i, max - i, " 
clang/%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__); + length += snprintf(buf + length, max - length, " clang/%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__); # elif defined(__GNUC__) - size_t i = strlen(buf); - snprintf(buf + i, max - i, " gcc/%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); + length += snprintf(buf + length, max - length, " gcc/%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); # endif return buf; diff --git a/src/common/Platform_unix.cpp b/src/common/Platform_unix.cpp index 1263b846..901df4be 100644 --- a/src/common/Platform_unix.cpp +++ b/src/common/Platform_unix.cpp @@ -54,9 +54,9 @@ typedef cpuset_t cpu_set_t; char *Platform::createUserAgent() { - const size_t max = 160; + constexpr const size_t max = 256; - char *buf = new char[max]; + char *buf = new char[max](); int length = snprintf(buf, max, "%s/%s (Linux ", APP_NAME, APP_VERSION); # if defined(__x86_64__) @@ -70,7 +70,9 @@ char *Platform::createUserAgent() length += snprintf(buf + length, max - length, " CUDA/%d.%d", cudaVersion / 1000, cudaVersion % 100); # endif -# ifdef __GNUC__ +# ifdef __clang__ + length += snprintf(buf + length, max - length, " clang/%d.%d.%d", __clang_major__, __clang_minor__, __clang_patchlevel__); +# elif defined(__GNUC__) length += snprintf(buf + length, max - length, " gcc/%d.%d.%d", __GNUC__, __GNUC_MINOR__, __GNUC_PATCHLEVEL__); # endif diff --git a/src/common/Platform_win.cpp b/src/common/Platform_win.cpp index d220f58d..3fa7ea9a 100644 --- a/src/common/Platform_win.cpp +++ b/src/common/Platform_win.cpp @@ -63,9 +63,9 @@ static inline OSVERSIONINFOEX winOsVersion() char *Platform::createUserAgent() { const auto osver = winOsVersion(); - const size_t max = 160; + constexpr const size_t max = 256; - char *buf = new char[max]; + char *buf = new char[max](); int length = snprintf(buf, max, "%s/%s (Windows NT %lu.%lu", APP_NAME, APP_VERSION, osver.dwMajorVersion, osver.dwMinorVersion); # if 
defined(__x86_64__) || defined(_M_AMD64) From cfe3995aa8d0ad35bf06ca7ae76f61351f53a58d Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 9 Jan 2019 16:43:36 +0700 Subject: [PATCH 13/26] Sync changes. --- src/base/tools/String.h | 3 ++- src/common/Platform_win.cpp | 4 +++- src/common/config/ConfigLoader.cpp | 2 +- src/common/config/ConfigLoader.h | 2 +- src/common/net/Client.cpp | 4 +++- src/common/net/Client.h | 2 +- src/common/net/Id.h | 2 +- src/common/net/Job.cpp | 2 +- src/common/net/Job.h | 2 +- src/version.h | 4 ++-- 10 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/base/tools/String.h b/src/base/tools/String.h index b2da0940..0c191dfd 100644 --- a/src/base/tools/String.h +++ b/src/base/tools/String.h @@ -60,7 +60,7 @@ public: bool isEqual(const String &other) const; - inline bool contains(const char *str) const { return strstr(m_data, str) != nullptr; } + inline bool contains(const char *str) const { return isNull() ? false : strstr(m_data, str) != nullptr; } inline bool isEmpty() const { return size() == 0; } @@ -75,6 +75,7 @@ public: inline bool operator<(const String &str) const { return strcmp(data(), str.data()) < 0; } inline bool operator==(const char *str) const { return isEqual(str); } inline bool operator==(const String &other) const { return isEqual(other); } + inline operator const char*() const { return m_data; } inline String &operator=(char *str) { move(str); return *this; } inline String &operator=(const char *str) { copy(str); return *this; } inline String &operator=(const String &str) { copy(str); return *this; } diff --git a/src/common/Platform_win.cpp b/src/common/Platform_win.cpp index 3fa7ea9a..9e9b772d 100644 --- a/src/common/Platform_win.cpp +++ b/src/common/Platform_win.cpp @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it 
under the terms of the GNU General Public License as published by @@ -39,7 +39,9 @@ #endif +#ifdef XMRIG_AMD_PROJECT static uint32_t timerResolution = 0; +#endif static inline OSVERSIONINFOEX winOsVersion() diff --git a/src/common/config/ConfigLoader.cpp b/src/common/config/ConfigLoader.cpp index 0365e151..b3b3ecb0 100644 --- a/src/common/config/ConfigLoader.cpp +++ b/src/common/config/ConfigLoader.cpp @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/config/ConfigLoader.h b/src/common/config/ConfigLoader.h index 840080f9..b9e04537 100644 --- a/src/common/config/ConfigLoader.h +++ b/src/common/config/ConfigLoader.h @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/net/Client.cpp b/src/common/net/Client.cpp index c0ab50b1..8458b1e2 100644 --- a/src/common/net/Client.cpp +++ b/src/common/net/Client.cpp @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -212,9 +212,11 @@ const char *Client::tlsVersion() const int64_t Client::submit(const JobResult &result) { +# ifndef XMRIG_PROXY_PROJECT if (result.clientId != m_rpcId) { return -1; } +# endif using namespace rapidjson; diff --git a/src/common/net/Client.h b/src/common/net/Client.h index 
d6418338..a05710fc 100644 --- a/src/common/net/Client.h +++ b/src/common/net/Client.h @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/net/Id.h b/src/common/net/Id.h index 4b13e793..999e7837 100644 --- a/src/common/net/Id.h +++ b/src/common/net/Id.h @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index 6d3ee993..acb3b3f4 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/net/Job.h b/src/common/net/Job.h index 51e3428e..394727df 100644 --- a/src/common/net/Job.h +++ b/src/common/net/Job.h @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/version.h b/src/version.h index 4f929666..05748d19 100644 --- a/src/version.h +++ b/src/version.h @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - 
* Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -30,7 +30,7 @@ #define APP_VERSION "2.8.5-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" -#define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com" +#define APP_COPYRIGHT "Copyright (C) 2016-2019 xmrig.com" #define APP_KIND "cpu" #define APP_VER_MAJOR 2 From 16b4fd0ff5a2734a0a7defc2efe8e3c1e7bf92c0 Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 9 Jan 2019 21:47:03 +0700 Subject: [PATCH 14/26] Update variant detection for nicehash.com and minergate.com. --- src/common/net/Pool.cpp | 36 ++++++++++++++++++++++++++---------- src/common/net/Pool.h | 2 +- 2 files changed, 27 insertions(+), 11 deletions(-) diff --git a/src/common/net/Pool.cpp b/src/common/net/Pool.cpp index 089a8727..b9a50ae8 100644 --- a/src/common/net/Pool.cpp +++ b/src/common/net/Pool.cpp @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -322,23 +322,39 @@ void Pool::adjustVariant(const xmrig::Variant variantHint) m_nicehash = true; bool valid = true; - if (m_host.contains("cryptonight.") && m_port == 3355) { - valid = m_algorithm.algo() == CRYPTONIGHT; + switch (m_port) { + case 3355: + case 33355: + valid = m_algorithm.algo() == CRYPTONIGHT && m_host.contains("cryptonight."); m_algorithm.setVariant(VARIANT_0); - } - else if (m_host.contains("cryptonightv7.") && m_port == 3363) { - valid = m_algorithm.algo() == CRYPTONIGHT; + break; + + case 3363: + case 33363: + valid = m_algorithm.algo() == CRYPTONIGHT && m_host.contains("cryptonightv7."); m_algorithm.setVariant(VARIANT_1); - } - else if 
(m_host.contains("cryptonightheavy.") && m_port == 3364) { - valid = m_algorithm.algo() == CRYPTONIGHT_HEAVY; + break; + + case 3364: + valid = m_algorithm.algo() == CRYPTONIGHT_HEAVY && m_host.contains("cryptonightheavy."); m_algorithm.setVariant(VARIANT_0); + break; + + case 3367: + case 33367: + valid = m_algorithm.algo() == CRYPTONIGHT && m_host.contains("cryptonightv8."); + m_algorithm.setVariant(VARIANT_2); + break; + + default: + break; } if (!valid) { m_algorithm.setAlgo(INVALID_ALGO); } + m_tls = m_port > 33000; return; } @@ -349,7 +365,7 @@ void Pool::adjustVariant(const xmrig::Variant variantHint) if (m_host.contains("xmr.pool.")) { valid = m_algorithm.algo() == CRYPTONIGHT; - m_algorithm.setVariant(m_port == 45700 ? VARIANT_2 : VARIANT_0); + m_algorithm.setVariant(m_port == 45700 ? VARIANT_AUTO : VARIANT_0); } else if (m_host.contains("aeon.pool.") && m_port == 45690) { valid = m_algorithm.algo() == CRYPTONIGHT_LITE; diff --git a/src/common/net/Pool.h b/src/common/net/Pool.h index 123cc131..c051b0ee 100644 --- a/src/common/net/Pool.h +++ b/src/common/net/Pool.h @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by From 84c2cadc5045429cf37416753900d8fe774e4c38 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 13 Jan 2019 15:03:36 +0700 Subject: [PATCH 15/26] #899 Initial support for "cn/xtlv9" (C++ only). 
--- src/common/crypto/Algorithm.cpp | 7 +- src/common/crypto/Algorithm.h | 2 +- src/common/net/Job.cpp | 3 + src/common/net/Pool.cpp | 1 + src/common/xmrig.h | 3 +- src/crypto/CryptoNight_constants.h | 6 ++ src/crypto/CryptoNight_monero.h | 10 +-- src/crypto/CryptoNight_test.h | 44 ++++++++---- src/crypto/CryptoNight_x86.h | 108 +++++++++++++++-------------- src/workers/CpuThread.cpp | 17 ++++- src/workers/CpuThread.h | 2 +- src/workers/MultiWorker.cpp | 17 ++--- src/workers/MultiWorker.h | 2 +- 13 files changed, 140 insertions(+), 82 deletions(-) diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index a3cf48b2..d8131ceb 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -62,6 +62,7 @@ static AlgoData const algorithms[] = { { "cryptonight/xao", "cn/xao", xmrig::CRYPTONIGHT, xmrig::VARIANT_XAO }, { "cryptonight/rto", "cn/rto", xmrig::CRYPTONIGHT, xmrig::VARIANT_RTO }, { "cryptonight/2", "cn/2", xmrig::CRYPTONIGHT, xmrig::VARIANT_2 }, + { "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_XTL2 }, # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, @@ -109,9 +110,13 @@ static const char *variants[] = { "xao", "rto", "2", + "xtlv9" }; +static_assert(xmrig::VARIANT_MAX == ARRAY_SIZE(variants), "variants size mismatch"); + + bool xmrig::Algorithm::isValid() const { if (m_algo == INVALID_ALGO) { diff --git a/src/common/crypto/Algorithm.h b/src/common/crypto/Algorithm.h index 731fa793..4a975ad1 100644 --- a/src/common/crypto/Algorithm.h +++ b/src/common/crypto/Algorithm.h @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 
2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index acb3b3f4..a0108d6a 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -124,6 +124,9 @@ bool Job::setBlob(const char *blob) if (m_autoVariant) { m_algorithm.setVariant(variant()); } + else if (m_algorithm.variant() == xmrig::VARIANT_XTL && m_blob[0] >= 9) { + m_algorithm.setVariant(xmrig::VARIANT_XTL2); + } # ifdef XMRIG_PROXY_PROJECT memset(m_rawBlob, 0, sizeof(m_rawBlob)); diff --git a/src/common/net/Pool.cpp b/src/common/net/Pool.cpp index b9a50ae8..617a03b9 100644 --- a/src/common/net/Pool.cpp +++ b/src/common/net/Pool.cpp @@ -412,6 +412,7 @@ void Pool::rebuild() addVariant(xmrig::VARIANT_2); addVariant(xmrig::VARIANT_1); addVariant(xmrig::VARIANT_0); + addVariant(xmrig::VARIANT_XTL2); addVariant(xmrig::VARIANT_XTL); addVariant(xmrig::VARIANT_TUBE); addVariant(xmrig::VARIANT_MSR); diff --git a/src/common/xmrig.h b/src/common/xmrig.h index 20306d1c..b60c4bd7 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -69,6 +69,7 @@ enum Variant { VARIANT_XAO = 6, // Modified CryptoNight variant 0 (Alloy only) VARIANT_RTO = 7, // Modified CryptoNight variant 1 (Arto only) VARIANT_2 = 8, // CryptoNight variant 2 + VARIANT_XTL2 = 9, VARIANT_MAX }; diff --git a/src/crypto/CryptoNight_constants.h b/src/crypto/CryptoNight_constants.h index f13891a7..74f725ee 100644 --- a/src/crypto/CryptoNight_constants.h +++ 
b/src/crypto/CryptoNight_constants.h @@ -40,6 +40,7 @@ constexpr const uint32_t CRYPTONIGHT_MASK = 0x1FFFF0; constexpr const uint32_t CRYPTONIGHT_ITER = 0x80000; constexpr const uint32_t CRYPTONIGHT_MSR_ITER = 0x40000; constexpr const uint32_t CRYPTONIGHT_XAO_ITER = 0x100000; +constexpr const uint32_t CRYPTONIGHT_XTL2_ITER = 0x40000; constexpr const size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024; constexpr const uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0; @@ -109,6 +110,7 @@ template<> inline constexpr uint32_t cn_select_iter() template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } +template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_XTL2_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_MSR_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_XAO_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } @@ -128,6 +130,9 @@ inline uint32_t cn_select_iter(Algo algorithm, Variant variant) case VARIANT_RTO: return CRYPTONIGHT_XAO_ITER; + case VARIANT_XTL2: + return CRYPTONIGHT_XTL2_ITER; + default: break; } @@ -161,6 +166,7 @@ template<> inline constexpr Variant cn_base_variant() { return VA template<> inline constexpr Variant cn_base_variant() { return VARIANT_0; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_1; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } +template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } } /* namespace xmrig */ diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h index 52229026..966e516f 100644 --- a/src/crypto/CryptoNight_monero.h +++ b/src/crypto/CryptoNight_monero.h @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 
2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -33,21 +33,21 @@ #ifndef XMRIG_ARM # define VARIANT1_INIT(part) \ uint64_t tweak1_2_##part = 0; \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ tweak1_2_##part = (*reinterpret_cast(input + 35 + part * size) ^ \ *(reinterpret_cast(ctx[part]->state) + 24)); \ } #else # define VARIANT1_INIT(part) \ uint64_t tweak1_2_##part = 0; \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ memcpy(&tweak1_2_##part, input + 35 + part * size, sizeof tweak1_2_##part); \ tweak1_2_##part ^= *(reinterpret_cast(ctx[part]->state) + 24); \ } #endif #define VARIANT1_1(p) \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ const uint8_t tmp = reinterpret_cast(p)[11]; \ static const uint32_t table = 0x75310; \ const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ @@ -55,7 +55,7 @@ } #define VARIANT1_2(p, part) \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ (p) ^= tweak1_2_##part; \ } diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 95e12197..dcbd85d2 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,8 +22,8 @@ * along with this program. If not, see . 
*/ -#ifndef __CRYPTONIGHT_TEST_H__ -#define __CRYPTONIGHT_TEST_H__ +#ifndef XMRIG_CRYPTONIGHT_TEST_H +#define XMRIG_CRYPTONIGHT_TEST_H const static uint8_t test_input[380] = { @@ -55,6 +55,7 @@ const static uint8_t test_input[380] = { }; +// "cn/0" const static uint8_t test_output_v0[160] = { 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, @@ -69,7 +70,7 @@ const static uint8_t test_output_v0[160] = { }; -// Cryptonight variant 1 (Monero v7) +// "cn/1" Cryptonight variant 1 (Monero v7) const static uint8_t test_output_v1[160] = { 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, @@ -84,7 +85,7 @@ const static uint8_t test_output_v1[160] = { }; -// Cryptonight variant 2 (Monero v8) +// "cn/2" Cryptonight variant 2 (Monero v8) const static uint8_t test_output_v2[160] = { 0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14, 0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21, @@ -99,7 +100,7 @@ const static uint8_t test_output_v2[160] = { }; -// Stellite (XTL) +// "cn/xtl" Stellite (XTL) const static uint8_t test_output_xtl[160] = { 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, @@ -114,7 +115,22 @@ const static uint8_t test_output_xtl[160] = { }; -// Masari (MSR) +// "cn/xtl2" +const static uint8_t test_output_xtl2[160] = { + 0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD, + 0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7, + 0x02, 0xE6, 0x1D, 0x2B, 0xBC, 0x84, 0xB6, 
0x71, 0x96, 0x71, 0xD5, 0x0C, 0xAC, 0x76, 0x0E, 0x6B, + 0xF1, 0xF0, 0x55, 0x34, 0x15, 0x29, 0x93, 0x04, 0x2D, 0xED, 0xD2, 0x33, 0x50, 0x6E, 0xBE, 0x25, + 0xD0, 0xFD, 0x8E, 0xC6, 0x15, 0xD5, 0x12, 0x53, 0x7B, 0x26, 0xF6, 0x01, 0xA5, 0xA8, 0xBE, 0x7C, + 0xCF, 0x5E, 0x19, 0xB7, 0x63, 0x0D, 0x0F, 0x02, 0x2B, 0xD7, 0xC4, 0x8C, 0x12, 0x24, 0x80, 0x02, + 0xE7, 0xB7, 0xA0, 0x4F, 0x94, 0xF9, 0x46, 0xB5, 0x18, 0x64, 0x7E, 0x4E, 0x9C, 0x81, 0x6C, 0x60, + 0x7D, 0x2E, 0xEA, 0xCF, 0x90, 0xCB, 0x68, 0x09, 0xC9, 0x53, 0xF6, 0xA9, 0xCA, 0x0C, 0xAC, 0xDC, + 0xFD, 0x07, 0xDA, 0x24, 0x1D, 0xD1, 0x35, 0x32, 0x3C, 0xE8, 0x64, 0x44, 0x5E, 0xCB, 0xB5, 0x00, + 0x69, 0xF4, 0x6F, 0xBB, 0x62, 0x0D, 0x25, 0xD8, 0xAC, 0x20, 0x90, 0xC5, 0x1B, 0xD3, 0x5F, 0xCA +}; + + +// "cn/msr" Masari (MSR) const static uint8_t test_output_msr[160] = { 0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C, 0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5, @@ -129,7 +145,7 @@ const static uint8_t test_output_msr[160] = { }; -// Alloy (XAO) +// "cn/xao" Alloy (XAO) const static uint8_t test_output_xao[160] = { 0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C, 0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33, @@ -144,7 +160,7 @@ const static uint8_t test_output_xao[160] = { }; -// Arto (RTO) +// "cn/rto" Arto (RTO) const static uint8_t test_output_rto[160] = { 0x82, 0x66, 0x1E, 0x1C, 0x6E, 0x64, 0x36, 0x66, 0x84, 0x06, 0x32, 0x7A, 0x9B, 0xB1, 0x13, 0x19, 0xA5, 0x56, 0x16, 0x15, 0xDF, 0xEC, 0x1C, 0x9E, 0xE3, 0x88, 0x4A, 0x6C, 0x1C, 0xEB, 0x76, 0xA5, @@ -160,6 +176,7 @@ const static uint8_t test_output_rto[160] = { #ifndef XMRIG_NO_AEON +// "cn-lite/0" const static uint8_t test_output_v0_lite[160] = { 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 
0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, @@ -174,7 +191,7 @@ const static uint8_t test_output_v0_lite[160] = { }; -// AEON v7 +// "cn-lite/1" AEON v7 const static uint8_t test_output_v1_lite[160] = { 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, @@ -191,6 +208,7 @@ const static uint8_t test_output_v1_lite[160] = { #ifndef XMRIG_NO_SUMO +// "cn-heavy/0" const static uint8_t test_output_v0_heavy[160] = { 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, @@ -204,6 +222,8 @@ const static uint8_t test_output_v0_heavy[160] = { 0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB }; + +// "cn-heavy/xhv" const static uint8_t test_output_xhv_heavy[160] = { 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, @@ -218,7 +238,7 @@ const static uint8_t test_output_xhv_heavy[160] = { }; -// TUBE +// "cn-heavy/tube" const static uint8_t test_output_tube_heavy[160] = { 0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF, 0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35, @@ -234,4 +254,4 @@ const static uint8_t test_output_tube_heavy[160] = { #endif -#endif /* __CRYPTONIGHT_TEST_H__ */ +#endif /* XMRIG_CRYPTONIGHT_TEST_H */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index dfcd1296..f717d035 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 
XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -427,10 +427,10 @@ static inline __m128i int_sqrt_v2(const uint64_t n0) } -template +template static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx) { - if (VARIANT == xmrig::VARIANT_2) { + if (BASE == xmrig::VARIANT_2) { VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { @@ -453,12 +453,12 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, const uint8_t* l, template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 32); return; } @@ -498,8 +498,8 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si cx = _mm_aesenc_si128(cx, ax0); } - if (IS_V1 || VARIANT == xmrig::VARIANT_2) { - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx0, bx1, cx); + if (BASE == xmrig::VARIANT_1 || BASE == xmrig::VARIANT_2) { + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx0, bx1, cx); } else { _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); } @@ -509,7 +509,8 @@ inline void 
cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(0, cl, cx); lo = __umul128(idx0, cl, &hi); VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); @@ -523,9 +524,9 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; } else { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; @@ -548,9 +549,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si idx0 = d ^ q; } - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { bx1 = bx0; } + bx0 = cx; } @@ -620,12 +623,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ template inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 64); return; } @@ -682,9 +685,9 @@ inline void 
cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si cx1 = _mm_aesenc_si128(cx1, ax1); } - if (IS_V1 || (VARIANT == xmrig::VARIANT_2)) { - cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, bx01, cx0); - cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, bx11, cx1); + if (BASE == xmrig::VARIANT_1 || (BASE == xmrig::VARIANT_2)) { + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, bx01, cx0); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, bx11, cx1); } else { _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx10, cx1)); @@ -696,7 +699,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(0, cl, cx0); lo = __umul128(idx0, cl, &hi); VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); @@ -709,9 +713,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*) &l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*) &l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; } else { ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; @@ -737,7 +741,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(1, cl, 
cx1); lo = __umul128(idx1, cl, &hi); VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); @@ -750,9 +755,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l1[idx1 & MASK])[0] = al1; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1 ^ al1; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1; } else { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1; @@ -776,10 +781,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si idx1 = d ^ q; } - if (VARIANT == xmrig::VARIANT_2) { + if (BASE == xmrig::VARIANT_2) { bx01 = bx00; bx11 = bx10; } + bx00 = cx0; bx10 = cx1; } @@ -810,8 +816,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si c = _mm_aesenc_si128(c, a); \ } \ \ - if (IS_V1 || (VARIANT == xmrig::VARIANT_2)) { \ - cryptonight_monero_tweak((uint64_t*)ptr, l, idx & MASK, a, b0, b1, c); \ + if (BASE == xmrig::VARIANT_1 || BASE == xmrig::VARIANT_2) { \ + cryptonight_monero_tweak((uint64_t*)ptr, l, idx & MASK, a, b0, b1, c); \ } else { \ _mm_store_si128(ptr, _mm_xor_si128(b0, c)); \ } @@ -825,7 +831,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si #define CN_STEP4(part, a, b0, b1, c, l, mc, ptr, idx) \ - if (VARIANT == xmrig::VARIANT_2) { \ + if (BASE == xmrig::VARIANT_2) { \ VARIANT2_INTEGER_MATH(part, cl##part, c); \ lo = __umul128(idx, cl##part, &hi); \ VARIANT2_SHUFFLE2(l, idx & MASK, a, b0, b1, hi, lo); \ @@ -834,7 +840,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si } \ a = _mm_add_epi64(a, _mm_set_epi64x(lo, hi)); \ \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ _mm_store_si128(ptr, _mm_xor_si128(a, mc)); \ \ if (VARIANT == 
xmrig::VARIANT_TUBE || \ @@ -859,7 +865,7 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si \ idx = d ^ q; \ } \ - if (VARIANT == xmrig::VARIANT_2) { \ + if (BASE == xmrig::VARIANT_2) { \ b1 = b0; \ } \ b0 = c; @@ -869,11 +875,11 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si __m128i mc##n; \ __m128i division_result_xmm_##n; \ __m128i sqrt_result_xmm_##n; \ - if (IS_V1) { \ + if (BASE == xmrig::VARIANT_1) { \ mc##n = _mm_set_epi64x(*reinterpret_cast(input + n * size + 35) ^ \ *(reinterpret_cast((ctx)->state) + 24), 0); \ } \ - if (VARIANT == xmrig::VARIANT_2) { \ + if (BASE == xmrig::VARIANT_2) { \ division_result_xmm_##n = _mm_cvtsi64_si128(h##n[12]); \ sqrt_result_xmm_##n = _mm_cvtsi64_si128(h##n[13]); \ } \ @@ -886,12 +892,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si template inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 32 * 3); return; } @@ -950,12 +956,12 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si template inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr 
size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1;; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 32 * 4); return; } @@ -1023,12 +1029,12 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size template inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 32 * 5); return; } diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index b6e91a65..87978f38 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -152,6 +152,17 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a cryptonight_quad_hash, cryptonight_penta_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_single_hash, + 
cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + # ifndef XMRIG_NO_AEON cryptonight_single_hash, cryptonight_double_hash, @@ -182,6 +193,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_XAO nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RTO nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_2 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_XTL2 # else nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, @@ -192,6 +204,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, # endif # ifndef XMRIG_NO_SUMO @@ -236,6 +249,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_XAO nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_RTO nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_2 + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, // VARIANT_XTL2 # else nullptr, 
nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, @@ -246,6 +260,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, # endif # ifndef XMRIG_NO_ASM cryptonight_single_hash_asm, diff --git a/src/workers/CpuThread.h b/src/workers/CpuThread.h index 29ab9696..71c3173d 100644 --- a/src/workers/CpuThread.h +++ b/src/workers/CpuThread.h @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index 645259b6..39259d53 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -54,13 +54,14 @@ bool MultiWorker::selfTest() using namespace xmrig; if (m_thread->algorithm() == CRYPTONIGHT) { - return verify(VARIANT_0, test_output_v0) && - verify(VARIANT_1, test_output_v1) && - verify(VARIANT_2, test_output_v2) && - verify(VARIANT_XTL, test_output_xtl) && - verify(VARIANT_MSR, test_output_msr) && - verify(VARIANT_XAO, 
test_output_xao) && - verify(VARIANT_RTO, test_output_rto); + return verify(VARIANT_0, test_output_v0) && + verify(VARIANT_1, test_output_v1) && + verify(VARIANT_2, test_output_v2) && + verify(VARIANT_XTL, test_output_xtl) && + verify(VARIANT_MSR, test_output_msr) && + verify(VARIANT_XAO, test_output_xao) && + verify(VARIANT_RTO, test_output_rto) && + verify(VARIANT_XTL2, test_output_xtl2); } # ifndef XMRIG_NO_AEON diff --git a/src/workers/MultiWorker.h b/src/workers/MultiWorker.h index c08e4fbe..b9d07b52 100644 --- a/src/workers/MultiWorker.h +++ b/src/workers/MultiWorker.h @@ -7,7 +7,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by From 1382857c836a089317b2c9ff4c0ea4d042bb50bf Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 13 Jan 2019 15:24:04 +0700 Subject: [PATCH 16/26] #899 Fixed ARM build. 
--- src/crypto/CryptoNight_arm.h | 61 +++++++++++++++++++----------------- 1 file changed, 33 insertions(+), 28 deletions(-) diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 4fcebc3e..b8c5092f 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -8,7 +8,7 @@ * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett * Copyright 2018 SChernykh - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -402,12 +402,12 @@ static inline __m128i aes_round_tweak_div(const __m128i &in, const __m128i &key) } -template +template static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i bx1, __m128i cx) { uint64_t* mem_out = (uint64_t*)&l[idx]; - if (VARIANT == xmrig::VARIANT_2) { + if (BASE == xmrig::VARIANT_2) { VARIANT2_SHUFFLE(l, idx, ax0, bx0, bx1); _mm_store_si128((__m128i *)mem_out, _mm_xor_si128(bx0, cx)); } else { @@ -429,12 +429,12 @@ static inline void cryptonight_monero_tweak(const uint8_t* l, uint64_t idx, __m1 template inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 32); return; } @@ -473,8 +473,8 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ 
input, size_t si cx = _mm_aesenc_si128(cx, ax0); } - if (IS_V1 || VARIANT == xmrig::VARIANT_2) { - cryptonight_monero_tweak(l0, idx0 & MASK, ax0, bx0, bx1, cx); + if (BASE == xmrig::VARIANT_1 || BASE == xmrig::VARIANT_2) { + cryptonight_monero_tweak(l0, idx0 & MASK, ax0, bx0, bx1, cx); } else { _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx)); } @@ -484,7 +484,8 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(0, cl, cx); lo = __umul128(idx0, cl, &hi); VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx0, bx1, hi, lo); @@ -498,9 +499,9 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; } else { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0; @@ -525,9 +526,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si idx0 = d ^ q; } } - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { bx1 = bx0; } + bx0 = cx; } @@ -541,12 +544,12 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si template inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx) { - constexpr size_t MASK = xmrig::cn_select_mask(); - constexpr size_t ITERATIONS = xmrig::cn_select_iter(); - constexpr size_t MEM = xmrig::cn_select_memory(); - constexpr bool IS_V1 = 
xmrig::cn_base_variant() == xmrig::VARIANT_1; + constexpr size_t MASK = xmrig::cn_select_mask(); + constexpr size_t ITERATIONS = xmrig::cn_select_iter(); + constexpr size_t MEM = xmrig::cn_select_memory(); + constexpr xmrig::Variant BASE = xmrig::cn_base_variant(); - if (IS_V1 && size < 43) { + if (BASE == xmrig::VARIANT_1 && size < 43) { memset(output, 0, 64); return; } @@ -602,9 +605,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si cx1 = _mm_aesenc_si128(cx1, ax1); } - if (IS_V1 || (VARIANT == xmrig::VARIANT_2)) { - cryptonight_monero_tweak(l0, idx0 & MASK, ax0, bx00, bx01, cx0); - cryptonight_monero_tweak(l1, idx1 & MASK, ax1, bx10, bx11, cx1); + if (BASE == xmrig::VARIANT_1 || (BASE == xmrig::VARIANT_2)) { + cryptonight_monero_tweak(l0, idx0 & MASK, ax0, bx00, bx01, cx0); + cryptonight_monero_tweak(l1, idx1 & MASK, ax1, bx10, bx11, cx1); } else { _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx10, cx1)); @@ -616,7 +619,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(0, cl, cx0); lo = __umul128(idx0, cl, &hi); VARIANT2_SHUFFLE2(l0, idx0 & MASK, ax0, bx00, bx01, hi, lo); @@ -629,9 +633,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l0[idx0 & MASK])[0] = al0; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0; } else { ((uint64_t*)&l0[idx0 & MASK])[1] = 
ah0; @@ -659,7 +663,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - if (VARIANT == xmrig::VARIANT_2) { + + if (BASE == xmrig::VARIANT_2) { VARIANT2_INTEGER_MATH(1, cl, cx1); lo = __umul128(idx1, cl, &hi); VARIANT2_SHUFFLE2(l1, idx1 & MASK, ax1, bx10, bx11, hi, lo); @@ -672,9 +677,9 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si ((uint64_t*)&l1[idx1 & MASK])[0] = al1; - if (IS_V1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { + if (BASE == xmrig::VARIANT_1 && (VARIANT == xmrig::VARIANT_TUBE || VARIANT == xmrig::VARIANT_RTO)) { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1 ^ al1; - } else if (IS_V1) { + } else if (BASE == xmrig::VARIANT_1) { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1; } else { ((uint64_t*)&l1[idx1 & MASK])[1] = ah1; From 67863a427d2443f45842f9ac3df6f312e31456f7 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 13 Jan 2019 19:42:27 +0700 Subject: [PATCH 17/26] Rename "cn/xtlv9" to "cn/half". 
--- src/common/crypto/Algorithm.cpp | 11 ++++++++--- src/common/net/Client.cpp | 3 +-- src/common/net/Job.cpp | 5 ++++- src/common/net/Pool.cpp | 2 +- src/common/xmrig.h | 2 +- src/crypto/CryptoNight_constants.h | 17 +++++++---------- src/crypto/CryptoNight_test.h | 4 ++-- src/workers/CpuThread.cpp | 20 ++++++++++---------- src/workers/MultiWorker.cpp | 2 +- 9 files changed, 35 insertions(+), 31 deletions(-) diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index d8131ceb..909c772a 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -62,7 +62,8 @@ static AlgoData const algorithms[] = { { "cryptonight/xao", "cn/xao", xmrig::CRYPTONIGHT, xmrig::VARIANT_XAO }, { "cryptonight/rto", "cn/rto", xmrig::CRYPTONIGHT, xmrig::VARIANT_RTO }, { "cryptonight/2", "cn/2", xmrig::CRYPTONIGHT, xmrig::VARIANT_2 }, - { "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_XTL2 }, + { "cryptonight/half", "cn/half", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF }, + { "cryptonight/xtlv9", "cn/xtlv9", xmrig::CRYPTONIGHT, xmrig::VARIANT_HALF }, # ifndef XMRIG_NO_AEON { "cryptonight-lite", "cn-lite", xmrig::CRYPTONIGHT_LITE, xmrig::VARIANT_AUTO }, @@ -110,7 +111,7 @@ static const char *variants[] = { "xao", "rto", "2", - "xtlv9" + "half" }; @@ -174,9 +175,13 @@ void xmrig::Algorithm::parseVariant(const char *variant) for (size_t i = 0; i < ARRAY_SIZE(variants); i++) { if (strcasecmp(variant, variants[i]) == 0) { m_variant = static_cast<Variant>(i); - break; + return; } } + + if (strcasecmp(variant, "xtlv9") == 0) { + m_variant = VARIANT_HALF; + } } diff --git a/src/common/net/Client.cpp b/src/common/net/Client.cpp index 8458b1e2..6da63942 100644 --- a/src/common/net/Client.cpp +++ b/src/common/net/Client.cpp @@ -342,8 +342,7 @@ bool Client::parseJob(const rapidjson::Value &params, int *code) if (params.HasMember("algo")) { job.setAlgorithm(params["algo"].GetString()); } - - if (params.HasMember("variant")) { + else if
(params.HasMember("variant")) { const rapidjson::Value &variant = params["variant"]; if (variant.IsInt()) { diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index a0108d6a..2eb84f18 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -125,7 +125,10 @@ bool Job::setBlob(const char *blob) m_algorithm.setVariant(variant()); } else if (m_algorithm.variant() == xmrig::VARIANT_XTL && m_blob[0] >= 9) { - m_algorithm.setVariant(xmrig::VARIANT_XTL2); + m_algorithm.setVariant(xmrig::VARIANT_HALF); + } + else if (m_algorithm.variant() == xmrig::VARIANT_MSR && m_blob[0] >= 8) { + m_algorithm.setVariant(xmrig::VARIANT_HALF); } # ifdef XMRIG_PROXY_PROJECT diff --git a/src/common/net/Pool.cpp b/src/common/net/Pool.cpp index 617a03b9..a44f8a41 100644 --- a/src/common/net/Pool.cpp +++ b/src/common/net/Pool.cpp @@ -412,7 +412,7 @@ void Pool::rebuild() addVariant(xmrig::VARIANT_2); addVariant(xmrig::VARIANT_1); addVariant(xmrig::VARIANT_0); - addVariant(xmrig::VARIANT_XTL2); + addVariant(xmrig::VARIANT_HALF); addVariant(xmrig::VARIANT_XTL); addVariant(xmrig::VARIANT_TUBE); addVariant(xmrig::VARIANT_MSR); diff --git a/src/common/xmrig.h b/src/common/xmrig.h index b60c4bd7..883d866d 100644 --- a/src/common/xmrig.h +++ b/src/common/xmrig.h @@ -69,7 +69,7 @@ enum Variant { VARIANT_XAO = 6, // Modified CryptoNight variant 0 (Alloy only) VARIANT_RTO = 7, // Modified CryptoNight variant 1 (Arto only) VARIANT_2 = 8, // CryptoNight variant 2 - VARIANT_XTL2 = 9, + VARIANT_HALF = 9, // CryptoNight variant 2 with half iterations (Masari/Stellite) VARIANT_MAX }; diff --git a/src/crypto/CryptoNight_constants.h b/src/crypto/CryptoNight_constants.h index 74f725ee..f0032305 100644 --- a/src/crypto/CryptoNight_constants.h +++ b/src/crypto/CryptoNight_constants.h @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can 
redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -38,9 +38,8 @@ namespace xmrig constexpr const size_t CRYPTONIGHT_MEMORY = 2 * 1024 * 1024; constexpr const uint32_t CRYPTONIGHT_MASK = 0x1FFFF0; constexpr const uint32_t CRYPTONIGHT_ITER = 0x80000; -constexpr const uint32_t CRYPTONIGHT_MSR_ITER = 0x40000; +constexpr const uint32_t CRYPTONIGHT_HALF_ITER = 0x40000; constexpr const uint32_t CRYPTONIGHT_XAO_ITER = 0x100000; -constexpr const uint32_t CRYPTONIGHT_XTL2_ITER = 0x40000; constexpr const size_t CRYPTONIGHT_LITE_MEMORY = 1 * 1024 * 1024; constexpr const uint32_t CRYPTONIGHT_LITE_MASK = 0xFFFF0; @@ -110,8 +109,8 @@ template<> inline constexpr uint32_t cn_select_iter() template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } -template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_XTL2_ITER; } -template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_MSR_ITER; } +template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HALF_ITER; } +template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_HALF_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_XAO_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_ITER; } template<> inline constexpr uint32_t cn_select_iter() { return CRYPTONIGHT_LITE_ITER; } @@ -125,14 +124,12 @@ inline uint32_t cn_select_iter(Algo algorithm, Variant variant) { switch (variant) { case VARIANT_MSR: - return CRYPTONIGHT_MSR_ITER; + case VARIANT_HALF: + return CRYPTONIGHT_HALF_ITER; case VARIANT_RTO: return CRYPTONIGHT_XAO_ITER; - case VARIANT_XTL2: - return CRYPTONIGHT_XTL2_ITER; - default: break; } @@ -166,7 +163,7 @@ template<> inline constexpr Variant cn_base_variant() 
{ return VA template<> inline constexpr Variant cn_base_variant() { return VARIANT_0; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_1; } template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } -template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } +template<> inline constexpr Variant cn_base_variant() { return VARIANT_2; } } /* namespace xmrig */ diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index dcbd85d2..63550ed8 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -115,8 +115,8 @@ const static uint8_t test_output_xtl[160] = { }; -// "cn/xtl2" -const static uint8_t test_output_xtl2[160] = { +// "cn/half" +const static uint8_t test_output_half[160] = { 0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD, 0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7, 0x02, 0xE6, 0x1D, 0x2B, 0xBC, 0x84, 0xB6, 0x71, 0x96, 0x71, 0xD5, 0x0C, 0xAC, 0x76, 0x0E, 0x6B, diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 87978f38..e03e6b5a 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -152,16 +152,16 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a cryptonight_quad_hash, cryptonight_penta_hash, - cryptonight_single_hash, - cryptonight_double_hash, - cryptonight_single_hash, - cryptonight_double_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, - cryptonight_triple_hash, - cryptonight_quad_hash, - cryptonight_penta_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_single_hash, + cryptonight_double_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, + cryptonight_triple_hash, + cryptonight_quad_hash, + cryptonight_penta_hash, # ifndef XMRIG_NO_AEON cryptonight_single_hash, diff --git 
a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index 39259d53..f0188322 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -61,7 +61,7 @@ bool MultiWorker::selfTest() verify(VARIANT_MSR, test_output_msr) && verify(VARIANT_XAO, test_output_xao) && verify(VARIANT_RTO, test_output_rto) && - verify(VARIANT_XTL2, test_output_xtl2); + verify(VARIANT_HALF, test_output_half); } # ifndef XMRIG_NO_AEON From b43336582d87fcf6b39e8d20f66fc8abfa6210b2 Mon Sep 17 00:00:00 2001 From: XMRig Date: Sun, 13 Jan 2019 23:00:41 +0700 Subject: [PATCH 18/26] Rename ASM files. --- cmake/asm.cmake | 8 ++++---- .../asm/{ => cn2}/cnv2_double_main_loop_sandybridge.inc | 0 src/crypto/asm/{ => cn2}/cnv2_main_loop_bulldozer.inc | 0 src/crypto/asm/{ => cn2}/cnv2_main_loop_ivybridge.inc | 0 src/crypto/asm/{ => cn2}/cnv2_main_loop_ryzen.inc | 0 src/crypto/asm/{cnv2_main_loop.S => cn_main_loop.S} | 8 ++++---- src/crypto/asm/{cnv2_main_loop.asm => cn_main_loop.asm} | 8 ++++---- .../win64/{ => cn2}/cnv2_double_main_loop_sandybridge.inc | 0 .../asm/win64/{ => cn2}/cnv2_main_loop_bulldozer.inc | 0 .../asm/win64/{ => cn2}/cnv2_main_loop_ivybridge.inc | 0 src/crypto/asm/win64/{ => cn2}/cnv2_main_loop_ryzen.inc | 0 src/crypto/asm/win64/{cnv2_main_loop.S => cn_main_loop.S} | 8 ++++---- .../asm/win64/{cnv2_main_loop.asm => cn_main_loop.asm} | 8 ++++---- 13 files changed, 20 insertions(+), 20 deletions(-) rename src/crypto/asm/{ => cn2}/cnv2_double_main_loop_sandybridge.inc (100%) rename src/crypto/asm/{ => cn2}/cnv2_main_loop_bulldozer.inc (100%) rename src/crypto/asm/{ => cn2}/cnv2_main_loop_ivybridge.inc (100%) rename src/crypto/asm/{ => cn2}/cnv2_main_loop_ryzen.inc (100%) rename src/crypto/asm/{cnv2_main_loop.S => cn_main_loop.S} (80%) rename src/crypto/asm/{cnv2_main_loop.asm => cn_main_loop.asm} (76%) rename src/crypto/asm/win64/{ => cn2}/cnv2_double_main_loop_sandybridge.inc (100%) rename src/crypto/asm/win64/{ => cn2}/cnv2_main_loop_bulldozer.inc (100%) rename 
src/crypto/asm/win64/{ => cn2}/cnv2_main_loop_ivybridge.inc (100%) rename src/crypto/asm/win64/{ => cn2}/cnv2_main_loop_ryzen.inc (100%) rename src/crypto/asm/win64/{cnv2_main_loop.S => cn_main_loop.S} (66%) rename src/crypto/asm/win64/{cnv2_main_loop.asm => cn_main_loop.asm} (76%) diff --git a/cmake/asm.cmake b/cmake/asm.cmake index 358d5666..bdb5d134 100644 --- a/cmake/asm.cmake +++ b/cmake/asm.cmake @@ -5,9 +5,9 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) enable_language(ASM_MASM) if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141) - set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.asm") + set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.asm") else() - set(XMRIG_ASM_FILE "src/crypto/asm/win64/cnv2_main_loop.asm") + set(XMRIG_ASM_FILE "src/crypto/asm/win64/cn_main_loop.asm") endif() set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) @@ -15,9 +15,9 @@ if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) enable_language(ASM) if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) - set(XMRIG_ASM_FILE "src/crypto/asm/win64/cnv2_main_loop.S") + set(XMRIG_ASM_FILE "src/crypto/asm/win64/cn_main_loop.S") else() - set(XMRIG_ASM_FILE "src/crypto/asm/cnv2_main_loop.S") + set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S") endif() set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) diff --git a/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc similarity index 100% rename from src/crypto/asm/cnv2_double_main_loop_sandybridge.inc rename to src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc diff --git a/src/crypto/asm/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc similarity index 100% rename from src/crypto/asm/cnv2_main_loop_bulldozer.inc rename to src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc diff --git a/src/crypto/asm/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc similarity index 100% rename from 
src/crypto/asm/cnv2_main_loop_ivybridge.inc rename to src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc diff --git a/src/crypto/asm/cnv2_main_loop_ryzen.inc b/src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc similarity index 100% rename from src/crypto/asm/cnv2_main_loop_ryzen.inc rename to src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc diff --git a/src/crypto/asm/cnv2_main_loop.S b/src/crypto/asm/cn_main_loop.S similarity index 80% rename from src/crypto/asm/cnv2_main_loop.S rename to src/crypto/asm/cn_main_loop.S index a23f24bf..b134a013 100644 --- a/src/crypto/asm/cnv2_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -16,7 +16,7 @@ ALIGN 16 FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cnv2_main_loop_ivybridge.inc" + #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 @@ -24,7 +24,7 @@ ALIGN 16 FN_PREFIX(cnv2_mainloop_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cnv2_main_loop_ryzen.inc" + #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 @@ -32,7 +32,7 @@ ALIGN 16 FN_PREFIX(cnv2_mainloop_bulldozer_asm): sub rsp, 48 mov rcx, rdi - #include "cnv2_main_loop_bulldozer.inc" + #include "cn2/cnv2_main_loop_bulldozer.inc" add rsp, 48 ret 0 @@ -41,6 +41,6 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): sub rsp, 48 mov rcx, rdi mov rdx, rsi - #include "cnv2_double_main_loop_sandybridge.inc" + #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 diff --git a/src/crypto/asm/cnv2_main_loop.asm b/src/crypto/asm/cn_main_loop.asm similarity index 76% rename from src/crypto/asm/cnv2_main_loop.asm rename to src/crypto/asm/cn_main_loop.asm index 557f1ab6..47194f1f 100644 --- a/src/crypto/asm/cnv2_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -6,25 +6,25 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC - INCLUDE cnv2_main_loop_ivybridge.inc + INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC - 
INCLUDE cnv2_main_loop_ryzen.inc + INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC - INCLUDE cnv2_main_loop_bulldozer.inc + INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC - INCLUDE cnv2_double_main_loop_sandybridge.inc + INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 cnv2_double_mainloop_sandybridge_asm ENDP diff --git a/src/crypto/asm/win64/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc similarity index 100% rename from src/crypto/asm/win64/cnv2_double_main_loop_sandybridge.inc rename to src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc diff --git a/src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc similarity index 100% rename from src/crypto/asm/win64/cnv2_main_loop_bulldozer.inc rename to src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc diff --git a/src/crypto/asm/win64/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc similarity index 100% rename from src/crypto/asm/win64/cnv2_main_loop_ivybridge.inc rename to src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc diff --git a/src/crypto/asm/win64/cnv2_main_loop_ryzen.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc similarity index 100% rename from src/crypto/asm/win64/cnv2_main_loop_ryzen.inc rename to src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc diff --git a/src/crypto/asm/win64/cnv2_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S similarity index 66% rename from src/crypto/asm/win64/cnv2_main_loop.S rename to src/crypto/asm/win64/cn_main_loop.S index 1be27c64..1793cd16 100644 --- a/src/crypto/asm/win64/cnv2_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -8,20 +8,20 @@ ALIGN 16 cnv2_mainloop_ivybridge_asm: - #include "../cnv2_main_loop_ivybridge.inc" + #include 
"../cn2/cnv2_main_loop_ivybridge.inc" ret 0 ALIGN 16 cnv2_mainloop_ryzen_asm: - #include "../cnv2_main_loop_ryzen.inc" + #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 ALIGN 16 cnv2_mainloop_bulldozer_asm: - #include "../cnv2_main_loop_bulldozer.inc" + #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 ALIGN 16 cnv2_double_mainloop_sandybridge_asm: - #include "../cnv2_double_main_loop_sandybridge.inc" + #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 diff --git a/src/crypto/asm/win64/cnv2_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm similarity index 76% rename from src/crypto/asm/win64/cnv2_main_loop.asm rename to src/crypto/asm/win64/cn_main_loop.asm index 557f1ab6..47194f1f 100644 --- a/src/crypto/asm/win64/cnv2_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -6,25 +6,25 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC - INCLUDE cnv2_main_loop_ivybridge.inc + INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC - INCLUDE cnv2_main_loop_ryzen.inc + INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC - INCLUDE cnv2_main_loop_bulldozer.inc + INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC - INCLUDE cnv2_double_main_loop_sandybridge.inc + INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 cnv2_double_mainloop_sandybridge_asm ENDP From 492449e9fbc80f76afdc2852d4355e8b7670bdf4 Mon Sep 17 00:00:00 2001 From: XMRig Date: Mon, 14 Jan 2019 18:09:16 +0700 Subject: [PATCH 19/26] #899 Add ASM implementation for cn/half. 
--- src/crypto/CryptoNight_x86.h | 39 +- .../cn_half_double_main_loop_sandybridge.inc | 410 ++++++++++++++++++ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 ++++++++ .../cn_half/cn_half_main_loop_ivybridge.inc | 186 ++++++++ .../asm/cn_half/cn_half_main_loop_ryzen.inc | 179 ++++++++ src/crypto/asm/cn_main_loop.S | 38 ++ src/crypto/asm/cn_main_loop.asm | 28 ++ .../cn_half_double_main_loop_sandybridge.inc | 410 ++++++++++++++++++ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 ++++++++ .../cn_half/cn_half_main_loop_ivybridge.inc | 186 ++++++++ .../win64/cn_half/cn_half_main_loop_ryzen.inc | 179 ++++++++ src/crypto/asm/win64/cn_main_loop.S | 25 ++ src/crypto/asm/win64/cn_main_loop.asm | 28 ++ src/workers/CpuThread.cpp | 20 +- 14 files changed, 2075 insertions(+), 13 deletions(-) create mode 100644 src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc create mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc create mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc create mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc create mode 100644 src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc create mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc create mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc create mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index f717d035..fef3dc19 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -570,6 +570,11 @@ extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern "C" void cn_half_mainloop_ivybridge_asm(cryptonight_ctx *ctx); +extern "C" void cn_half_mainloop_ryzen_asm(cryptonight_ctx *ctx); +extern "C" void 
cn_half_mainloop_bulldozer_asm(cryptonight_ctx *ctx); +extern "C" void cn_half_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); + template inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx) @@ -579,14 +584,27 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_ xmrig::keccak(input, size, ctx[0]->state); cn_explode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); - if (ASM == xmrig::ASM_INTEL) { - cnv2_mainloop_ivybridge_asm(ctx[0]); + if (VARIANT == xmrig::VARIANT_2) { + if (ASM == xmrig::ASM_INTEL) { + cnv2_mainloop_ivybridge_asm(ctx[0]); + } + else if (ASM == xmrig::ASM_RYZEN) { + cnv2_mainloop_ryzen_asm(ctx[0]); + } + else { + cnv2_mainloop_bulldozer_asm(ctx[0]); + } } - else if (ASM == xmrig::ASM_RYZEN) { - cnv2_mainloop_ryzen_asm(ctx[0]); - } - else { - cnv2_mainloop_bulldozer_asm(ctx[0]); + else if (VARIANT == xmrig::VARIANT_HALF) { + if (ASM == xmrig::ASM_INTEL) { + cn_half_mainloop_ivybridge_asm(ctx[0]); + } + else if (ASM == xmrig::ASM_RYZEN) { + cn_half_mainloop_ryzen_asm(ctx[0]); + } + else { + cn_half_mainloop_bulldozer_asm(ctx[0]); + } } cn_implode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); @@ -606,7 +624,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ cn_explode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory)); cn_explode_scratchpad(reinterpret_cast<__m128i*>(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory)); - cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + if (VARIANT == xmrig::VARIANT_2) { + cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + } + else if (VARIANT == xmrig::VARIANT_HALF) { + cn_half_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + } 
cn_implode_scratchpad(reinterpret_cast<__m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state)); cn_implode_scratchpad(reinterpret_cast<__m128i*>(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state)); diff --git a/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..2497ef95 --- /dev/null +++ b/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 262144 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + 
movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 16 +main_loop_double_half_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 
+ movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_half_sandybridge +div_fix_1_ret_half_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + 
imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_half_sandybridge +div_fix_2_ret_half_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_half_sandybridge +sqrt_fix_1_ret_half_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_half_sandybridge +sqrt_fix_2_ret_half_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_half_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD 
PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_half_sandybridge_endp + +div_fix_1_half_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_half_sandybridge + +div_fix_2_half_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_half_sandybridge + +sqrt_fix_1_half_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_half_sandybridge + +sqrt_fix_2_half_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_half_sandybridge + +cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc b/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc new file mode 100644 index 00000000..460f9b66 --- /dev/null +++ b/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor 
rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_half_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_half_bulldozer + shr rdi, 19 + +sqrt_fixup_half_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + 
paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_half_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_half_bulldozer_endp + +sqrt_fixup_half_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_half_bulldozer_ret + +cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc new file mode 100644 index 00000000..51b82bec --- /dev/null +++ b/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 262144 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, 
QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 16 +main_loop_half_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je sqrt_fixup_half_ivybridge + psrlq xmm3, 19 +sqrt_fixup_half_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + 
+ mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_half_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_half_ivybridge_endp + +sqrt_fixup_half_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp sqrt_fixup_half_ivybridge_ret + +cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc b/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc new file mode 100644 index 00000000..8da3d8c4 --- /dev/null +++ b/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + 
mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +main_loop_half_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_half_ryzen + shr rdi, 19 + +sqrt_fixup_half_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq 
xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_half_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_half_ryzen_endp + +sqrt_fixup_half_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_half_ryzen_ret + +cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index b134a013..95905d34 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -12,6 +12,11 @@ .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_half_mainloop_ivybridge_asm) +.global FN_PREFIX(cn_half_mainloop_ryzen_asm) +.global FN_PREFIX(cn_half_mainloop_bulldozer_asm) +.global FN_PREFIX(cn_half_double_mainloop_sandybridge_asm) + ALIGN 16 FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 @@ -44,3 +49,36 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 + +ALIGN 16 +FN_PREFIX(cn_half_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_half/cn_half_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cn_half_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include 
"cn_half/cn_half_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cn_half_mainloop_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_half/cn_half_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cn_half_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cn_half/cn_half_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index 47194f1f..fefb77a3 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -3,6 +3,10 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm +PUBLIC cn_half_mainloop_ivybridge_asm +PUBLIC cn_half_mainloop_ryzen_asm +PUBLIC cn_half_mainloop_bulldozer_asm +PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC @@ -28,5 +32,29 @@ cnv2_double_mainloop_sandybridge_asm PROC ret 0 cnv2_double_mainloop_sandybridge_asm ENDP +ALIGN 64 +cn_half_mainloop_ivybridge_asm PROC + INCLUDE cn_half/cn_half_main_loop_ivybridge.inc + ret 0 +cn_half_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cn_half_mainloop_ryzen_asm PROC + INCLUDE cn_half/cn_half_main_loop_ryzen.inc + ret 0 +cn_half_mainloop_ryzen_asm ENDP + +ALIGN 64 +cn_half_mainloop_bulldozer_asm PROC + INCLUDE cn_half/cn_half_main_loop_bulldozer.inc + ret 0 +cn_half_mainloop_bulldozer_asm ENDP + +ALIGN 64 +cn_half_double_mainloop_sandybridge_asm PROC + INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc + ret 0 +cn_half_double_mainloop_sandybridge_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..0c207f21 --- /dev/null +++ b/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc @@ -0,0 +1,410 
@@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 262144 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movd xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movd xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + 
punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 16 +main_loop_double_half_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + 
movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_half_sandybridge +div_fix_1_ret_half_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_half_sandybridge +div_fix_2_ret_half_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_half_sandybridge +sqrt_fix_1_ret_half_sandybridge: + + movd r9, xmm10 + psrldq xmm1, 8 + movd 
r8, xmm1 + test r8, 524287 + je sqrt_fix_2_half_sandybridge +sqrt_fix_2_ret_half_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_half_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_half_sandybridge_endp + +div_fix_1_half_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_half_sandybridge + +div_fix_2_half_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_half_sandybridge + +sqrt_fix_1_half_sandybridge: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + 
mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_half_sandybridge + +sqrt_fix_2_half_sandybridge: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_half_sandybridge + +cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc new file mode 100644 index 00000000..6597c791 --- /dev/null +++ b/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_half_bulldozer: + movdqa xmm5, 
XMMWORD PTR [r10+rbx] + movd xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movd r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movd xmm0, rax + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_half_bulldozer + shr rdi, 19 + +sqrt_fixup_half_bulldozer_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_half_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov 
rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_half_bulldozer_endp + +sqrt_fixup_half_bulldozer: + movd r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_half_bulldozer_ret + +cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc new file mode 100644 index 00000000..c769f827 --- /dev/null +++ b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 262144 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movd xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 16 +main_loop_half_ivybridge: + lea rdx, QWORD 
PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movd rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movd rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movd rdx, xmm3 + test edx, 524287 + je sqrt_fixup_half_ivybridge + psrlq xmm3, 19 +sqrt_fixup_half_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movd xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movd xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_half_ivybridge + + ldmxcsr DWORD PTR 
[rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_half_ivybridge_endp + +sqrt_fixup_half_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp sqrt_fixup_half_ivybridge_ret + +cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc new file mode 100644 index 00000000..0744aaa4 --- /dev/null +++ b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +main_loop_half_ryzen: + movdqa xmm5, XMMWORD PTR 
[r10+rbx] + movd xmm0, r11 + movd xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movd r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movd rax, xmm0 + + div r9 + movd xmm0, rax + movd xmm1, rdx + punpckldq xmm0, xmm1 + movd r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_half_ryzen + shr rdi, 19 + +sqrt_fixup_half_ryzen_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_half_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD 
PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_half_ryzen_endp + +sqrt_fixup_half_ryzen: + movd r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_half_ryzen_ret + +cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 1793cd16..691209f8 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -6,6 +6,11 @@ .global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm +.global cn_half_mainloop_ivybridge_asm +.global cn_half_mainloop_ryzen_asm +.global cn_half_mainloop_bulldozer_asm +.global cn_half_double_mainloop_sandybridge_asm + ALIGN 16 cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" @@ -25,3 +30,23 @@ ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 + +ALIGN 16 +cn_half_mainloop_ivybridge_asm: + #include "../cn_half/cn_half_main_loop_ivybridge.inc" + ret 0 + +ALIGN 16 +cn_half_mainloop_ryzen_asm: + #include "../cn_half/cn_half_main_loop_ryzen.inc" + ret 0 + +ALIGN 16 +cn_half_mainloop_bulldozer_asm: + #include "../cn_half/cn_half_main_loop_bulldozer.inc" + ret 0 + +ALIGN 16 +cn_half_double_mainloop_sandybridge_asm: + #include "../cn_half/cn_half_double_main_loop_sandybridge.inc" + ret 0 diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index 47194f1f..fefb77a3 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -3,6 +3,10 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC 
cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm +PUBLIC cn_half_mainloop_ivybridge_asm +PUBLIC cn_half_mainloop_ryzen_asm +PUBLIC cn_half_mainloop_bulldozer_asm +PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC @@ -28,5 +32,29 @@ cnv2_double_mainloop_sandybridge_asm PROC ret 0 cnv2_double_mainloop_sandybridge_asm ENDP +ALIGN 64 +cn_half_mainloop_ivybridge_asm PROC + INCLUDE cn_half/cn_half_main_loop_ivybridge.inc + ret 0 +cn_half_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cn_half_mainloop_ryzen_asm PROC + INCLUDE cn_half/cn_half_main_loop_ryzen.inc + ret 0 +cn_half_mainloop_ryzen_asm ENDP + +ALIGN 64 +cn_half_mainloop_bulldozer_asm PROC + INCLUDE cn_half/cn_half_main_loop_bulldozer.inc + ret 0 +cn_half_mainloop_bulldozer_asm ENDP + +ALIGN 64 +cn_half_double_mainloop_sandybridge_asm PROC + INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc + ret 0 +cn_half_double_mainloop_sandybridge_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index e03e6b5a..9a98b4e3 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -65,7 +65,7 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a assert(variant >= VARIANT_0 && variant < VARIANT_MAX); # ifndef XMRIG_NO_ASM - constexpr const size_t count = VARIANT_MAX * 10 * 3 + 4; + constexpr const size_t count = VARIANT_MAX * 10 * 3 + 8; # else constexpr const size_t count = VARIANT_MAX * 10 * 3; # endif @@ -266,7 +266,12 @@ xmrig::CpuThread::cn_hash_fun xmrig::CpuThread::fn(Algo algorithm, AlgoVariant a cryptonight_single_hash_asm, cryptonight_single_hash_asm, cryptonight_single_hash_asm, - cryptonight_double_hash_asm + cryptonight_double_hash_asm, + + cryptonight_single_hash_asm, + cryptonight_single_hash_asm, + cryptonight_single_hash_asm, + cryptonight_double_hash_asm # endif }; @@ -457,14 +462,19 @@ size_t xmrig::CpuThread::fnIndex(Algo algorithm, AlgoVariant 
av, Variant variant } constexpr const size_t offset = VARIANT_MAX * 10 * 3; + size_t extra_offset = 0; + + if (algorithm == CRYPTONIGHT && (variant == VARIANT_2 || variant == VARIANT_HALF)) { + if (variant == VARIANT_HALF) { + extra_offset += 4; + } - if (algorithm == CRYPTONIGHT && variant == VARIANT_2) { if (av == AV_SINGLE) { - return offset + assembly - 2; + return offset + extra_offset + assembly - 2; } if (av == AV_DOUBLE) { - return offset + 3; + return offset + 3 + extra_offset; } } # endif From eede1b48812d7fd875a53f07b539d445dbf1aa52 Mon Sep 17 00:00:00 2001 From: XMRig Date: Mon, 14 Jan 2019 20:59:39 +0700 Subject: [PATCH 20/26] Allow ignore block version. --- src/common/crypto/Algorithm.cpp | 18 +++++++++++++++++- src/common/crypto/Algorithm.h | 25 ++++++++++++++++++------- src/common/net/Job.cpp | 13 ++++++++----- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/common/crypto/Algorithm.cpp b/src/common/crypto/Algorithm.cpp index 909c772a..2f259372 100644 --- a/src/common/crypto/Algorithm.cpp +++ b/src/common/crypto/Algorithm.cpp @@ -150,10 +150,16 @@ void xmrig::Algorithm::parseAlgorithm(const char *algo) m_variant = VARIANT_AUTO; assert(algo != nullptr); - if (algo == nullptr) { + if (algo == nullptr || strlen(algo) < 1) { return; } + if (*algo == '!') { + m_flags |= Forced; + + return parseAlgorithm(algo + 1); + } + for (size_t i = 0; i < ARRAY_SIZE(algorithms); i++) { if ((strcasecmp(algo, algorithms[i].name) == 0) || (strcasecmp(algo, algorithms[i].shortName) == 0)) { m_algo = algorithms[i].algo; @@ -172,6 +178,16 @@ void xmrig::Algorithm::parseVariant(const char *variant) { m_variant = VARIANT_AUTO; + if (variant == nullptr || strlen(variant) < 1) { + return; + } + + if (*variant == '!') { + m_flags |= Forced; + + return parseVariant(variant + 1); + } + for (size_t i = 0; i < ARRAY_SIZE(variants); i++) { if (strcasecmp(variant, variants[i]) == 0) { m_variant = static_cast(i); diff --git a/src/common/crypto/Algorithm.h 
b/src/common/crypto/Algorithm.h index 4a975ad1..f4380d45 100644 --- a/src/common/crypto/Algorithm.h +++ b/src/common/crypto/Algorithm.h @@ -39,28 +39,38 @@ namespace xmrig { class Algorithm { public: + enum Flags { + None = 0, + Forced = 1 + }; + inline Algorithm() : m_algo(INVALID_ALGO), + m_flags(0), m_variant(VARIANT_AUTO) {} inline Algorithm(Algo algo, Variant variant) : + m_flags(0), m_variant(variant) { setAlgo(algo); } - inline Algorithm(const char *algo) + inline Algorithm(const char *algo) : + m_flags(0) { parseAlgorithm(algo); } - bool isEqual(const Algorithm &other) const { return m_algo == other.m_algo && m_variant == other.m_variant; } - inline Algo algo() const { return m_algo; } - inline const char *name() const { return name(false); } - inline const char *shortName() const { return name(true); } - inline Variant variant() const { return m_variant; } - inline void setVariant(Variant variant) { m_variant = variant; } + inline Algo algo() const { return m_algo; } + inline bool isEqual(const Algorithm &other) const { return m_algo == other.m_algo && m_variant == other.m_variant; } + inline bool isForced() const { return m_flags & Forced; } + inline const char *name() const { return name(false); } + inline const char *shortName() const { return name(true); } + inline int flags() const { return m_flags; } + inline Variant variant() const { return m_variant; } + inline void setVariant(Variant variant) { m_variant = variant; } inline bool operator!=(const Algorithm &other) const { return !isEqual(other); } inline bool operator==(const Algorithm &other) const { return isEqual(other); } @@ -80,6 +90,7 @@ private: const char *name(bool shortName) const; Algo m_algo; + int m_flags; Variant m_variant; }; diff --git a/src/common/net/Job.cpp b/src/common/net/Job.cpp index 2eb84f18..7da2ed83 100644 --- a/src/common/net/Job.cpp +++ b/src/common/net/Job.cpp @@ -124,11 +124,14 @@ bool Job::setBlob(const char *blob) if (m_autoVariant) { 
m_algorithm.setVariant(variant()); } - else if (m_algorithm.variant() == xmrig::VARIANT_XTL && m_blob[0] >= 9) { - m_algorithm.setVariant(xmrig::VARIANT_HALF); - } - else if (m_algorithm.variant() == xmrig::VARIANT_MSR && m_blob[0] >= 8) { - m_algorithm.setVariant(xmrig::VARIANT_HALF); + + if (!m_algorithm.isForced()) { + if (m_algorithm.variant() == xmrig::VARIANT_XTL && m_blob[0] >= 9) { + m_algorithm.setVariant(xmrig::VARIANT_HALF); + } + else if (m_algorithm.variant() == xmrig::VARIANT_MSR && m_blob[0] >= 8) { + m_algorithm.setVariant(xmrig::VARIANT_HALF); + } } # ifdef XMRIG_PROXY_PROJECT From 8b9d5cff91fe8b1e2f169e11a5a1267644ebfb3d Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 14 Jan 2019 15:34:55 +0100 Subject: [PATCH 21/26] Added ASM code patching when loading For CNv2 variants with different iterations and memory size. --- src/Mem.h | 3 + src/Mem_unix.cpp | 16 + src/Mem_win.cpp | 12 + src/crypto/CryptoNight_x86.h | 8 +- .../cn_half_double_main_loop_sandybridge.inc | 410 ------------------ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 -------- .../cn_half/cn_half_main_loop_ivybridge.inc | 186 -------- .../asm/cn_half/cn_half_main_loop_ryzen.inc | 179 -------- src/crypto/asm/cn_main_loop.S | 42 +- src/crypto/asm/cn_main_loop.asm | 32 +- .../cn_half_double_main_loop_sandybridge.inc | 410 ------------------ .../cn_half/cn_half_main_loop_bulldozer.inc | 180 -------- .../cn_half/cn_half_main_loop_ivybridge.inc | 186 -------- .../win64/cn_half/cn_half_main_loop_ryzen.inc | 179 -------- src/crypto/asm/win64/cn_main_loop.S | 31 +- src/crypto/asm/win64/cn_main_loop.asm | 32 +- src/workers/CpuThread.cpp | 56 +++ src/workers/CpuThread.h | 6 + src/workers/Workers.cpp | 4 + 19 files changed, 118 insertions(+), 2034 deletions(-) delete mode 100644 src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc 
delete mode 100644 src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc delete mode 100644 src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc diff --git a/src/Mem.h b/src/Mem.h index 6fd18fc1..0aa6eb4d 100644 --- a/src/Mem.h +++ b/src/Mem.h @@ -59,6 +59,9 @@ public: static void init(bool enabled); static void release(cryptonight_ctx **ctx, size_t count, MemInfo &info); + static void* allocate_executable_memory(size_t size); + static void FlushInstructionCache(void* p, size_t size); + static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; } private: diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp index c1aa0fb1..af7791bd 100644 --- a/src/Mem_unix.cpp +++ b/src/Mem_unix.cpp @@ -87,3 +87,19 @@ void Mem::release(MemInfo &info) _mm_free(info.memory); } } + + +void* Mem::allocate_executable_memory(size_t size) +{ +# if defined(__APPLE__) + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); +# else + return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +# endif +} + + +void Mem::FlushInstructionCache(void* p, size_t size) +{ + __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); +} diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp index 2bfcc3b0..2fad191d 100644 --- a/src/Mem_win.cpp +++ b/src/Mem_win.cpp @@ -182,3 +182,15 @@ void Mem::release(MemInfo &info) _mm_free(info.memory); } } + + +void* Mem::allocate_executable_memory(size_t size) +{ + return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); +} + + +void Mem::FlushInstructionCache(void* p, size_t size) +{ + ::FlushInstructionCache(GetCurrentProcess(), p, size); +} diff --git a/src/crypto/CryptoNight_x86.h 
b/src/crypto/CryptoNight_x86.h index fef3dc19..0c3fd52a 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -570,10 +570,10 @@ extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); -extern "C" void cn_half_mainloop_ivybridge_asm(cryptonight_ctx *ctx); -extern "C" void cn_half_mainloop_ryzen_asm(cryptonight_ctx *ctx); -extern "C" void cn_half_mainloop_bulldozer_asm(cryptonight_ctx *ctx); -extern "C" void cn_half_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm; +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm; +extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm; +extern xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm; template diff --git a/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc deleted file mode 100644 index 2497ef95..00000000 --- a/src/crypto/asm/cn_half/cn_half_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR 
[rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 16 -main_loop_double_half_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR 
[rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq 
xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_half_sandybridge -div_fix_1_ret_half_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_half_sandybridge -div_fix_2_ret_half_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_half_sandybridge -sqrt_fix_1_ret_half_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_half_sandybridge -sqrt_fix_2_ret_half_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - 
lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_half_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_half_sandybridge_endp - -div_fix_1_half_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_half_sandybridge - -div_fix_2_half_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_half_sandybridge - -sqrt_fix_1_half_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_half_sandybridge - -sqrt_fix_2_half_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_half_sandybridge - -cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc 
b/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc deleted file mode 100644 index 460f9b66..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_half_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movq r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - 
and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movq xmm0, rax - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_bulldozer - shr rdi, 19 - -sqrt_fixup_half_bulldozer_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_half_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_bulldozer_endp - -sqrt_fixup_half_bulldozer: - movq r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_bulldozer_ret - -cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc 
deleted file mode 100644 index 51b82bec..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - xor eax, eax - mov QWORD PTR [rsp+16], rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 16 -main_loop_half_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] 
- lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - psubq xmm3, XMMWORD PTR [rsp+16] - movq rdx, xmm3 - test edx, 524287 - je sqrt_fixup_half_ivybridge - psrlq xmm3, 19 -sqrt_fixup_half_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne main_loop_half_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_half_ivybridge_endp - -sqrt_fixup_half_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp sqrt_fixup_half_ivybridge_ret - -cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc 
b/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc deleted file mode 100644 index 8da3d8c4..00000000 --- a/src/crypto/asm/cn_half/cn_half_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -main_loop_half_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor 
rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_ryzen - shr rdi, 19 - -sqrt_fixup_half_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne main_loop_half_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_ryzen_endp - -sqrt_fixup_half_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_ryzen_ret - -cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 95905d34..417fd414 100644 --- a/src/crypto/asm/cn_main_loop.S +++ 
b/src/crypto/asm/cn_main_loop.S @@ -12,11 +12,6 @@ .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_half_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_half_mainloop_ryzen_asm) -.global FN_PREFIX(cn_half_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_half_double_mainloop_sandybridge_asm) - ALIGN 16 FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 @@ -24,6 +19,7 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm): #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_mainloop_ryzen_asm): @@ -32,6 +28,7 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_mainloop_bulldozer_asm): @@ -40,6 +37,7 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm): #include "cn2/cnv2_main_loop_bulldozer.inc" add rsp, 48 ret 0 + nop;nop;nop;nop; ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): @@ -49,36 +47,4 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_ivybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_ivybridge.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_ryzen_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_ryzen.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_mainloop_bulldozer_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_half/cn_half_main_loop_bulldozer.inc" - add rsp, 48 - ret 0 - -ALIGN 16 -FN_PREFIX(cn_half_double_mainloop_sandybridge_asm): - sub rsp, 48 - mov rcx, rdi - mov rdx, rsi - #include "cn_half/cn_half_double_main_loop_sandybridge.inc" - add rsp, 48 - ret 0 + nop;nop;nop;nop; diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index fefb77a3..9d4cede0 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ 
-3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm -PUBLIC cn_half_mainloop_ivybridge_asm -PUBLIC cn_half_mainloop_ryzen_asm -PUBLIC cn_half_mainloop_bulldozer_asm -PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 + nop;nop;nop;nop; cnv2_double_mainloop_sandybridge_asm ENDP -ALIGN 64 -cn_half_mainloop_ivybridge_asm PROC - INCLUDE cn_half/cn_half_main_loop_ivybridge.inc - ret 0 -cn_half_mainloop_ivybridge_asm ENDP - -ALIGN 64 -cn_half_mainloop_ryzen_asm PROC - INCLUDE cn_half/cn_half_main_loop_ryzen.inc - ret 0 -cn_half_mainloop_ryzen_asm ENDP - -ALIGN 64 -cn_half_mainloop_bulldozer_asm PROC - INCLUDE cn_half/cn_half_main_loop_bulldozer.inc - ret 0 -cn_half_mainloop_bulldozer_asm ENDP - -ALIGN 64 -cn_half_double_mainloop_sandybridge_asm PROC - INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc - ret 0 -cn_half_double_mainloop_sandybridge_asm ENDP - _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc deleted file mode 100644 index 0c207f21..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR 
[rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movd xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movd xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movd xmm5, QWORD PTR [r8+104] - movd xmm7, rax - - mov eax, 1 - shl rax, 52 - movd xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movd xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movd xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movd xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movd xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movd xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movd xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movd xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movd xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - 
punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 16 -main_loop_double_half_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movd xmm0, r11 - movd xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movd r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movd xmm0, rbp - movd xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movd rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movd rdx, xmm5 - shl rdx, 32 - movd rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movd xmm0, rdx - xor rdx, [r11+r13] - movd xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov 
r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movd r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movd r11, xmm0 - psrldq xmm1, 8 - movd r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movd rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movd rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movd r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_half_sandybridge -div_fix_1_ret_half_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_half_sandybridge -div_fix_2_ret_half_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movd r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_half_sandybridge -sqrt_fix_1_ret_half_sandybridge: - - movd r9, xmm10 - psrldq xmm1, 8 - movd r8, xmm1 - test r8, 524287 - je sqrt_fix_2_half_sandybridge -sqrt_fix_2_ret_half_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 
- mov rax, r10 - mul r9 - movd xmm0, rax - movd xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_half_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_half_sandybridge_endp - -div_fix_1_half_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_half_sandybridge - -div_fix_2_half_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_half_sandybridge - -sqrt_fix_1_half_sandybridge: - movd r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movd xmm5, r9 - 
punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_half_sandybridge - -sqrt_fix_2_half_sandybridge: - psrldq xmm3, 8 - movd r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movd xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_half_sandybridge - -cnv2_double_mainloop_asm_half_sandybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc deleted file mode 100644 index 6597c791..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_half_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, 
r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_bulldozer - shr rdi, 19 - -sqrt_fixup_half_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_half_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - 
pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_bulldozer_endp - -sqrt_fixup_half_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_bulldozer_ret - -cnv2_main_loop_half_bulldozer_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc deleted file mode 100644 index c769f827..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movd xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movd xmm5, rax - - xor eax, eax - mov QWORD PTR [rsp+16], rax - - mov ax, 1023 - shl rax, 52 - movd xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movd xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 16 -main_loop_half_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movd xmm0, r11 - movd xmm7, r8 - punpcklqdq xmm7, 
xmm0 - aesenc xmm6, xmm7 - movd rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movd rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movd rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movd xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - psubq xmm3, XMMWORD PTR [rsp+16] - movd rdx, xmm3 - test edx, 524287 - je sqrt_fixup_half_ivybridge - psrlq xmm3, 19 -sqrt_fixup_half_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movd xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movd xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne main_loop_half_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 
80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_half_ivybridge_endp - -sqrt_fixup_half_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movd xmm3, rdx - jmp sqrt_fixup_half_ivybridge_ret - -cnv2_main_loop_half_ivybridge_endp: diff --git a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc b/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc deleted file mode 100644 index 0744aaa4..00000000 --- a/src/crypto/asm/win64/cn_half/cn_half_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -main_loop_half_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm0, r11 - movd xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d 
- mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movd r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movd rax, xmm0 - - div r9 - movd xmm0, rax - movd xmm1, rdx - punpckldq xmm0, xmm1 - movd r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_half_ryzen - shr rdi, 19 - -sqrt_fixup_half_ryzen_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne main_loop_half_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 
- pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_half_ryzen_endp - -sqrt_fixup_half_ryzen: - movd r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_half_ryzen_ret - -cnv2_main_loop_half_ryzen_endp: diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 691209f8..4caaa1a1 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -6,47 +6,26 @@ .global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm -.global cn_half_mainloop_ivybridge_asm -.global cn_half_mainloop_ryzen_asm -.global cn_half_mainloop_bulldozer_asm -.global cn_half_double_mainloop_sandybridge_asm - ALIGN 16 cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_mainloop_ryzen_asm: #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_mainloop_bulldozer_asm: - #include "../cn2/cnv2_main_loop_bulldozer.inc" + #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 + nop;nop;nop;nop; ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 - -ALIGN 16 -cn_half_mainloop_ivybridge_asm: - #include "../cn_half/cn_half_main_loop_ivybridge.inc" - ret 0 - -ALIGN 16 -cn_half_mainloop_ryzen_asm: - #include "../cn_half/cn_half_main_loop_ryzen.inc" - ret 0 - -ALIGN 16 -cn_half_mainloop_bulldozer_asm: - #include "../cn_half/cn_half_main_loop_bulldozer.inc" - ret 0 - -ALIGN 16 -cn_half_double_mainloop_sandybridge_asm: - #include "../cn_half/cn_half_double_main_loop_sandybridge.inc" - ret 0 + nop;nop;nop;nop; diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index fefb77a3..9d4cede0 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ 
b/src/crypto/asm/win64/cn_main_loop.asm @@ -3,58 +3,34 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm -PUBLIC cn_half_mainloop_ivybridge_asm -PUBLIC cn_half_mainloop_ryzen_asm -PUBLIC cn_half_mainloop_bulldozer_asm -PUBLIC cn_half_double_mainloop_sandybridge_asm ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 + nop;nop;nop;nop; cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 + nop;nop;nop;nop; cnv2_double_mainloop_sandybridge_asm ENDP -ALIGN 64 -cn_half_mainloop_ivybridge_asm PROC - INCLUDE cn_half/cn_half_main_loop_ivybridge.inc - ret 0 -cn_half_mainloop_ivybridge_asm ENDP - -ALIGN 64 -cn_half_mainloop_ryzen_asm PROC - INCLUDE cn_half/cn_half_main_loop_ryzen.inc - ret 0 -cn_half_mainloop_ryzen_asm ENDP - -ALIGN 64 -cn_half_mainloop_bulldozer_asm PROC - INCLUDE cn_half/cn_half_main_loop_bulldozer.inc - ret 0 -cn_half_mainloop_bulldozer_asm ENDP - -ALIGN 64 -cn_half_double_mainloop_sandybridge_asm PROC - INCLUDE cn_half/cn_half_double_main_loop_sandybridge.inc - ret 0 -cn_half_double_mainloop_sandybridge_asm ENDP - _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 9a98b4e3..8f3457dc 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -31,6 +31,7 @@ #include "crypto/Asm.h" #include "rapidjson/document.h" #include "workers/CpuThread.h" +#include "Mem.h" #if defined(XMRIG_ARM) @@ -54,6 +55,61 @@ xmrig::CpuThread::CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiw } +#ifndef XMRIG_NO_ASM 
+template +static void patchCode(T& dst, U src, const uint32_t iterations, const uint32_t mask) +{ + const uint8_t* p = reinterpret_cast(src); + + size_t size = 0; + while (*(uint32_t*)(p + size) != 0x90909090) { + ++size; + } + + memcpy((void*) dst, (const void*) src, size); + + uint8_t* patched_data = reinterpret_cast(dst); + for (size_t i = 0; i + sizeof(uint32_t) <= size; ++i) { + switch (*(uint32_t*)(patched_data + i)) { + case xmrig::CRYPTONIGHT_ITER: + *(uint32_t*)(patched_data + i) = iterations; + break; + case xmrig::CRYPTONIGHT_MASK: + *(uint32_t*)(patched_data + i) = mask; + break; + } + } +} + +extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); +extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); + +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm = nullptr; +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm = nullptr; +xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm = nullptr; +xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm = nullptr; + +void xmrig::CpuThread::patchAsmVariants() +{ + const int allocation_size = 65536; + uint8_t* base = reinterpret_cast(Mem::allocate_executable_memory(allocation_size)); + + cn_half_mainloop_ivybridge_asm = reinterpret_cast (base + 0x0000); + cn_half_mainloop_ryzen_asm = reinterpret_cast (base + 0x1000); + cn_half_mainloop_bulldozer_asm = reinterpret_cast (base + 0x2000); + cn_half_double_mainloop_sandybridge_asm = reinterpret_cast (base + 0x3000); + + patchCode(cn_half_mainloop_ivybridge_asm, cnv2_mainloop_ivybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_mainloop_ryzen_asm, cnv2_mainloop_ryzen_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_mainloop_bulldozer_asm, 
cnv2_mainloop_bulldozer_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + patchCode(cn_half_double_mainloop_sandybridge_asm, cnv2_double_mainloop_sandybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); + + Mem::FlushInstructionCache(base, allocation_size); +} +#endif + bool xmrig::CpuThread::isSoftAES(AlgoVariant av) { return av == AV_SINGLE_SOFT || av == AV_DOUBLE_SOFT || av > AV_PENTA; diff --git a/src/workers/CpuThread.h b/src/workers/CpuThread.h index 71c3173d..e9d764da 100644 --- a/src/workers/CpuThread.h +++ b/src/workers/CpuThread.h @@ -60,6 +60,12 @@ public: CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiway multiway, int64_t affinity, int priority, bool softAES, bool prefetch, Assembly assembly); typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, cryptonight_ctx **ctx); + typedef void (*cn_mainloop_fun)(cryptonight_ctx *ctx); + typedef void (*cn_mainloop_double_fun)(cryptonight_ctx *ctx1, cryptonight_ctx *ctx2); + +# ifndef XMRIG_NO_ASM + static void patchAsmVariants(); +# endif static bool isSoftAES(AlgoVariant av); static cn_hash_fun fn(Algo algorithm, AlgoVariant av, Variant variant, Assembly assembly); diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp index a5109e9b..e285005e 100644 --- a/src/workers/Workers.cpp +++ b/src/workers/Workers.cpp @@ -168,6 +168,10 @@ void Workers::start(xmrig::Controller *controller) LOG_NOTICE("--------------------------------------------------------------------------"); # endif +# ifndef XMRIG_NO_ASM + xmrig::CpuThread::patchAsmVariants(); +# endif + m_controller = controller; const std::vector &threads = controller->config()->threads(); From 56cacbd5bcfc6e306ebab90190c1fa7407325b00 Mon Sep 17 00:00:00 2001 From: SChernykh Date: Mon, 14 Jan 2019 16:38:28 +0100 Subject: [PATCH 22/26] Fixes for Visual Studio --- src/crypto/asm/cn_main_loop.S | 20 ++++++++++++++++---- src/crypto/asm/cn_main_loop.asm | 20 ++++++++++++++++---- 
src/crypto/asm/win64/cn_main_loop.S | 20 ++++++++++++++++---- src/crypto/asm/win64/cn_main_loop.asm | 20 ++++++++++++++++---- src/workers/CpuThread.cpp | 7 ++++++- 5 files changed, 70 insertions(+), 17 deletions(-) diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 417fd414..e9ac64f5 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -19,7 +19,10 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm): #include "cn2/cnv2_main_loop_ivybridge.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 FN_PREFIX(cnv2_mainloop_ryzen_asm): @@ -28,7 +31,10 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): #include "cn2/cnv2_main_loop_ryzen.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 FN_PREFIX(cnv2_mainloop_bulldozer_asm): @@ -37,7 +43,10 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm): #include "cn2/cnv2_main_loop_bulldozer.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): @@ -47,4 +56,7 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "cn2/cnv2_double_main_loop_sandybridge.inc" add rsp, 48 ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index 9d4cede0..9c8a6ea9 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -8,28 +8,40 @@ ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc 
ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_double_mainloop_sandybridge_asm ENDP _TEXT_CNV2_MAINLOOP ENDS diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index 4caaa1a1..ea5a63b8 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -10,22 +10,34 @@ ALIGN 16 cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_mainloop_ryzen_asm: #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_mainloop_bulldozer_asm: #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop ALIGN 16 cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop diff --git a/src/crypto/asm/win64/cn_main_loop.asm b/src/crypto/asm/win64/cn_main_loop.asm index 9d4cede0..9c8a6ea9 100644 --- a/src/crypto/asm/win64/cn_main_loop.asm +++ b/src/crypto/asm/win64/cn_main_loop.asm @@ -8,28 +8,40 @@ ALIGN 64 cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ivybridge_asm ENDP ALIGN 64 cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_ryzen_asm ENDP ALIGN 64 cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_mainloop_bulldozer_asm ENDP ALIGN 64 cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 - nop;nop;nop;nop; + nop + nop + nop + nop cnv2_double_mainloop_sandybridge_asm ENDP _TEXT_CNV2_MAINLOOP ENDS diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index 8f3457dc..cf366860 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -57,10 +57,15 @@ 
xmrig::CpuThread::CpuThread(size_t index, Algo algorithm, AlgoVariant av, Multiw #ifndef XMRIG_NO_ASM template -static void patchCode(T& dst, U src, const uint32_t iterations, const uint32_t mask) +static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t mask) { const uint8_t* p = reinterpret_cast(src); + // Workaround for Visual Studio placing trampoline in debug builds + if (p[0] == 0xE9) { + p += *(int32_t*)(p + 1) + 5; + } + size_t size = 0; while (*(uint32_t*)(p + size) != 0x90909090) { ++size; From 17f28667b3875fe6e1e5d53572f6f97fc6a9b035 Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 15 Jan 2019 02:15:36 +0700 Subject: [PATCH 23/26] Code-style/copyright cleanup. --- src/Mem.cpp | 3 ++- src/Mem.h | 13 +++++++------ src/Mem_unix.cpp | 7 ++++--- src/Mem_win.cpp | 7 ++++--- src/crypto/CryptoNight_x86.h | 4 ++-- src/workers/CpuThread.cpp | 19 +++++++++++++------ src/workers/CpuThread.h | 1 + src/workers/Workers.cpp | 3 ++- src/workers/Workers.h | 9 +++++---- 9 files changed, 40 insertions(+), 26 deletions(-) diff --git a/src/Mem.cpp b/src/Mem.cpp index bb2da646..1e7e1e3c 100644 --- a/src/Mem.cpp +++ b/src/Mem.cpp @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/Mem.h b/src/Mem.h index 0aa6eb4d..21616a40 100644 --- a/src/Mem.h +++ b/src/Mem.h @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -22,8 +23,8 @@ * along with this program. 
If not, see . */ -#ifndef __MEM_H__ -#define __MEM_H__ +#ifndef XMRIG_MEM_H +#define XMRIG_MEM_H #include @@ -59,8 +60,8 @@ public: static void init(bool enabled); static void release(cryptonight_ctx **ctx, size_t count, MemInfo &info); - static void* allocate_executable_memory(size_t size); - static void FlushInstructionCache(void* p, size_t size); + static void *allocateExecutableMemory(size_t size); + static void flushInstructionCache(void *p, size_t size); static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; } @@ -73,4 +74,4 @@ private: }; -#endif /* __MEM_H__ */ +#endif /* XMRIG_MEM_H */ diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp index af7791bd..7db761ae 100644 --- a/src/Mem_unix.cpp +++ b/src/Mem_unix.cpp @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -89,7 +90,7 @@ void Mem::release(MemInfo &info) } -void* Mem::allocate_executable_memory(size_t size) +void *Mem::allocateExecutableMemory(size_t size) { # if defined(__APPLE__) return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0); @@ -99,7 +100,7 @@ void* Mem::allocate_executable_memory(size_t size) } -void Mem::FlushInstructionCache(void* p, size_t size) +void Mem::flushInstructionCache(void *p, size_t size) { __builtin___clear_cache(reinterpret_cast(p), reinterpret_cast(p) + size); } diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp index 2fad191d..c43b2ce4 100644 --- a/src/Mem_win.cpp +++ b/src/Mem_win.cpp @@ -6,7 +6,8 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: 
you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -184,13 +185,13 @@ void Mem::release(MemInfo &info) } -void* Mem::allocate_executable_memory(size_t size) +void *Mem::allocateExecutableMemory(size_t size) { return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE); } -void Mem::FlushInstructionCache(void* p, size_t size) +void Mem::flushInstructionCache(void *p, size_t size) { ::FlushInstructionCache(GetCurrentProcess(), p, size); } diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 0c3fd52a..d3ff25bc 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -6,7 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett - * Copyright 2018 SChernykh + * Copyright 2018-2019 SChernykh * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify @@ -568,7 +568,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); -extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx *ctx0, cryptonight_ctx *ctx1); extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm; extern xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm; diff --git a/src/workers/CpuThread.cpp b/src/workers/CpuThread.cpp index cf366860..5b7016e4 100644 --- a/src/workers/CpuThread.cpp +++ b/src/workers/CpuThread.cpp @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2018 SChernykh + * Copyright 2018-2019 SChernykh * Copyright 2016-2019 XMRig , * * This program is free 
software: you can redistribute it and/or modify @@ -29,9 +29,9 @@ #include "common/log/Log.h" #include "common/net/Pool.h" #include "crypto/Asm.h" +#include "Mem.h" #include "rapidjson/document.h" #include "workers/CpuThread.h" -#include "Mem.h" #if defined(XMRIG_ARM) @@ -61,10 +61,12 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma { const uint8_t* p = reinterpret_cast(src); - // Workaround for Visual Studio placing trampoline in debug builds + // Workaround for Visual Studio placing trampoline in debug builds. +# if defined(_MSC_VER) if (p[0] == 0xE9) { p += *(int32_t*)(p + 1) + 5; } +# endif size_t size = 0; while (*(uint32_t*)(p + size) != 0x90909090) { @@ -79,6 +81,7 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma case xmrig::CRYPTONIGHT_ITER: *(uint32_t*)(patched_data + i) = iterations; break; + case xmrig::CRYPTONIGHT_MASK: *(uint32_t*)(patched_data + i) = mask; break; @@ -86,20 +89,23 @@ static void patchCode(T dst, U src, const uint32_t iterations, const uint32_t ma } } + extern "C" void cnv2_mainloop_ivybridge_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_ryzen_asm(cryptonight_ctx *ctx); extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx *ctx); -extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx* ctx0, cryptonight_ctx* ctx1); +extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx *ctx0, cryptonight_ctx *ctx1); + xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ivybridge_asm = nullptr; xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_ryzen_asm = nullptr; xmrig::CpuThread::cn_mainloop_fun cn_half_mainloop_bulldozer_asm = nullptr; xmrig::CpuThread::cn_mainloop_double_fun cn_half_double_mainloop_sandybridge_asm = nullptr; + void xmrig::CpuThread::patchAsmVariants() { const int allocation_size = 65536; - uint8_t* base = reinterpret_cast(Mem::allocate_executable_memory(allocation_size)); + uint8_t *base = 
static_cast(Mem::allocateExecutableMemory(allocation_size)); cn_half_mainloop_ivybridge_asm = reinterpret_cast (base + 0x0000); cn_half_mainloop_ryzen_asm = reinterpret_cast (base + 0x1000); @@ -111,10 +117,11 @@ void xmrig::CpuThread::patchAsmVariants() patchCode(cn_half_mainloop_bulldozer_asm, cnv2_mainloop_bulldozer_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); patchCode(cn_half_double_mainloop_sandybridge_asm, cnv2_double_mainloop_sandybridge_asm, xmrig::CRYPTONIGHT_HALF_ITER, xmrig::CRYPTONIGHT_MASK); - Mem::FlushInstructionCache(base, allocation_size); + Mem::flushInstructionCache(base, allocation_size); } #endif + bool xmrig::CpuThread::isSoftAES(AlgoVariant av) { return av == AV_SINGLE_SOFT || av == AV_DOUBLE_SOFT || av > AV_PENTA; diff --git a/src/workers/CpuThread.h b/src/workers/CpuThread.h index e9d764da..a31be058 100644 --- a/src/workers/CpuThread.h +++ b/src/workers/CpuThread.h @@ -5,6 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , + * Copyright 2018-2019 SChernykh * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp index e285005e..d6201ebf 100644 --- a/src/workers/Workers.cpp +++ b/src/workers/Workers.cpp @@ -5,7 +5,8 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by diff --git a/src/workers/Workers.h b/src/workers/Workers.h index 1d619cea..61868819 100644 --- a/src/workers/Workers.h +++ b/src/workers/Workers.h @@ -5,7 +5,8 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2018-2019 SChernykh + * 
Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,8 +22,8 @@ * along with this program. If not, see . */ -#ifndef __WORKERS_H__ -#define __WORKERS_H__ +#ifndef XMRIG_WORKERS_H +#define XMRIG_WORKERS_H #include @@ -118,4 +119,4 @@ private: }; -#endif /* __WORKERS_H__ */ +#endif /* XMRIG_WORKERS_H */ From a98c475a3c0aba42e984fc163c5a9b1a4ce0a3ef Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 15 Jan 2019 18:18:04 +0700 Subject: [PATCH 24/26] Fixed wrong ASM code alignment on macOS, thanks @SChernykh. --- .../asm/cn2/cnv2_double_main_loop_sandybridge.inc | 2 +- src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc | 2 +- src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc | 2 +- src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc | 2 +- src/crypto/asm/cn_main_loop.S | 14 +++++++++----- src/crypto/asm/cn_main_loop.asm | 8 ++++---- .../cn2/cnv2_double_main_loop_sandybridge.inc | 2 +- .../asm/win64/cn2/cnv2_main_loop_bulldozer.inc | 2 +- .../asm/win64/cn2/cnv2_main_loop_ivybridge.inc | 2 +- src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc | 2 +- src/crypto/asm/win64/cn_main_loop.S | 10 +++++----- 11 files changed, 26 insertions(+), 22 deletions(-) diff --git a/src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc index e8251bc7..aa5101a8 100644 --- a/src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc +++ b/src/crypto/asm/cn2/cnv2_double_main_loop_sandybridge.inc @@ -94,7 +94,7 @@ lea r9, QWORD PTR [rdx+r13] movdqu xmm15, XMMWORD PTR [r9] - ALIGN 16 + ALIGN(64) main_loop_double_sandybridge: movdqu xmm9, xmm15 mov eax, edx diff --git a/src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc index 478976c0..c764501d 100644 --- a/src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc +++ b/src/crypto/asm/cn2/cnv2_main_loop_bulldozer.inc @@ -45,7 +45,7 @@ movq 
xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 16 + ALIGN(64) cnv2_main_loop_bulldozer: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm6, r8 diff --git a/src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc index 8c2c2d3b..06f1d28b 100644 --- a/src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc +++ b/src/crypto/asm/cn2/cnv2_main_loop_ivybridge.inc @@ -50,7 +50,7 @@ punpcklqdq xmm5, xmm0 movdqu xmm6, XMMWORD PTR [r10+rbx] - ALIGN 16 + ALIGN(64) main_loop_ivybridge: lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d diff --git a/src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc b/src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc index d386aa2d..5dbf5917 100644 --- a/src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc +++ b/src/crypto/asm/cn2/cnv2_main_loop_ryzen.inc @@ -45,7 +45,7 @@ movq xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 16 + ALIGN(64) main_loop_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm0, r11 diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index e9ac64f5..1e5610d1 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -1,4 +1,8 @@ -#define ALIGN .align +#ifdef __APPLE__ +# define ALIGN(x) .align 6 +#else +# define ALIGN(x) .align 64 +#endif .intel_syntax noprefix #ifdef __APPLE__ # define FN_PREFIX(fn) _ ## fn @@ -12,7 +16,7 @@ .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): sub rsp, 48 mov rcx, rdi @@ -24,7 +28,7 @@ FN_PREFIX(cnv2_mainloop_ivybridge_asm): nop nop -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_mainloop_ryzen_asm): sub rsp, 48 mov rcx, rdi @@ -36,7 +40,7 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm): nop nop -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_mainloop_bulldozer_asm): sub rsp, 48 mov rcx, rdi @@ -48,7 +52,7 @@ FN_PREFIX(cnv2_mainloop_bulldozer_asm): nop nop -ALIGN 16 +ALIGN(64) FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): sub rsp, 48 mov rcx, rdi diff --git 
a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm index 9c8a6ea9..47b4df9e 100644 --- a/src/crypto/asm/cn_main_loop.asm +++ b/src/crypto/asm/cn_main_loop.asm @@ -4,7 +4,7 @@ PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm -ALIGN 64 +ALIGN(64) cnv2_mainloop_ivybridge_asm PROC INCLUDE cn2/cnv2_main_loop_ivybridge.inc ret 0 @@ -14,7 +14,7 @@ cnv2_mainloop_ivybridge_asm PROC nop cnv2_mainloop_ivybridge_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_ryzen_asm PROC INCLUDE cn2/cnv2_main_loop_ryzen.inc ret 0 @@ -24,7 +24,7 @@ cnv2_mainloop_ryzen_asm PROC nop cnv2_mainloop_ryzen_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_mainloop_bulldozer_asm PROC INCLUDE cn2/cnv2_main_loop_bulldozer.inc ret 0 @@ -34,7 +34,7 @@ cnv2_mainloop_bulldozer_asm PROC nop cnv2_mainloop_bulldozer_asm ENDP -ALIGN 64 +ALIGN(64) cnv2_double_mainloop_sandybridge_asm PROC INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc ret 0 diff --git a/src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc index 44ea8923..05af9393 100644 --- a/src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc +++ b/src/crypto/asm/win64/cn2/cnv2_double_main_loop_sandybridge.inc @@ -94,7 +94,7 @@ lea r9, QWORD PTR [rdx+r13] movdqu xmm15, XMMWORD PTR [r9] - ALIGN 16 + ALIGN(64) main_loop_double_sandybridge: movdqu xmm9, xmm15 mov eax, edx diff --git a/src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc index c19e9d69..03a36f48 100644 --- a/src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc +++ b/src/crypto/asm/win64/cn2/cnv2_main_loop_bulldozer.inc @@ -45,7 +45,7 @@ movd xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 16 + ALIGN(64) cnv2_main_loop_bulldozer: movdqa xmm5, XMMWORD PTR [r10+rbx] movd xmm6, r8 diff --git a/src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc 
index c925ca24..77e28f80 100644 --- a/src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc +++ b/src/crypto/asm/win64/cn2/cnv2_main_loop_ivybridge.inc @@ -50,7 +50,7 @@ punpcklqdq xmm5, xmm0 movdqu xmm6, XMMWORD PTR [r10+rbx] - ALIGN 16 + ALIGN(64) main_loop_ivybridge: lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d diff --git a/src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc b/src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc index d1cd26c4..7e5c127f 100644 --- a/src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc +++ b/src/crypto/asm/win64/cn2/cnv2_main_loop_ryzen.inc @@ -45,7 +45,7 @@ movd xmm0, rcx punpcklqdq xmm4, xmm0 - ALIGN 16 + ALIGN(64) main_loop_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movd xmm0, r11 diff --git a/src/crypto/asm/win64/cn_main_loop.S b/src/crypto/asm/win64/cn_main_loop.S index ea5a63b8..90e43470 100644 --- a/src/crypto/asm/win64/cn_main_loop.S +++ b/src/crypto/asm/win64/cn_main_loop.S @@ -1,4 +1,4 @@ -#define ALIGN .align +#define ALIGN(x) .align 64 .intel_syntax noprefix .section .text .global cnv2_mainloop_ivybridge_asm @@ -6,7 +6,7 @@ .global cnv2_mainloop_bulldozer_asm .global cnv2_double_mainloop_sandybridge_asm -ALIGN 16 +ALIGN(64) cnv2_mainloop_ivybridge_asm: #include "../cn2/cnv2_main_loop_ivybridge.inc" ret 0 @@ -15,7 +15,7 @@ cnv2_mainloop_ivybridge_asm: nop nop -ALIGN 16 +ALIGN(64) cnv2_mainloop_ryzen_asm: #include "../cn2/cnv2_main_loop_ryzen.inc" ret 0 @@ -24,7 +24,7 @@ cnv2_mainloop_ryzen_asm: nop nop -ALIGN 16 +ALIGN(64) cnv2_mainloop_bulldozer_asm: #include "../cn2/cnv2_main_loop_bulldozer.inc" ret 0 @@ -33,7 +33,7 @@ cnv2_mainloop_bulldozer_asm: nop nop -ALIGN 16 +ALIGN(64) cnv2_double_mainloop_sandybridge_asm: #include "../cn2/cnv2_double_main_loop_sandybridge.inc" ret 0 From 09893bfd36ef8ef8eda01969162f23bf2dfd0bab Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 15 Jan 2019 18:28:35 +0700 Subject: [PATCH 25/26] Fix warnings on macOS. 
--- src/base/tools/String.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/base/tools/String.cpp b/src/base/tools/String.cpp index fe2792c7..7ed61d01 100644 --- a/src/base/tools/String.cpp +++ b/src/base/tools/String.cpp @@ -5,7 +5,7 @@ * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , - * Copyright 2016-2018 XMRig , + * Copyright 2016-2019 XMRig , * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -113,7 +113,7 @@ std::vector xmrig::String::split(char sep) const for (pos = 0; pos < m_size; ++pos) { if (m_data[pos] == sep) { if ((pos - start) > 0) { - out.push_back(std::move(String(m_data + start, pos - start))); + out.push_back(String(m_data + start, pos - start)); } start = pos + 1; @@ -121,7 +121,7 @@ std::vector xmrig::String::split(char sep) const } if ((pos - start) > 0) { - out.push_back(std::move(String(m_data + start, pos - start))); + out.push_back(String(m_data + start, pos - start)); } return out; From dd3243aa70e42fa0e4d2b4e867b91a8224e2016e Mon Sep 17 00:00:00 2001 From: xmrig Date: Wed, 16 Jan 2019 00:28:49 +0700 Subject: [PATCH 26/26] Update CHANGELOG.md --- CHANGELOG.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index cbab8e4a..c42bdb4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +# v2.9.0 +- [#899](https://github.com/xmrig/xmrig/issues/899) Added support for new algorithm `cn/half` for Masari and Stellite forks. +- [#834](https://github.com/xmrig/xmrig/pull/834) Added ASM optimized code for AMD Bulldozer. +- [#839](https://github.com/xmrig/xmrig/issues/839) Fixed FreeBSD compile. +- [#857](https://github.com/xmrig/xmrig/pull/857) Fixed impossible to build for macOS without clang. + # v2.8.3 - [#813](https://github.com/xmrig/xmrig/issues/813) Fixed critical bug with Minergate pool and variant 2.