diff --git a/src/crypto/cn/CnHash.cpp b/src/crypto/cn/CnHash.cpp index 0d9a9b37..f6a77f6a 100644 --- a/src/crypto/cn/CnHash.cpp +++ b/src/crypto/cn/CnHash.cpp @@ -293,12 +293,6 @@ xmrig::CnHash::CnHash() # ifdef XMRIG_ALGO_CN_FEMTO ADD_FN(Algorithm::CN_UPX2); ADD_FN_ASM(Algorithm::CN_UPX2); - -# if defined(_MSC_VER) && defined(XMRIG_FEATURE_ASM) - // This is somehow faster on Ryzen - m_map[Algorithm::CN_UPX2][AV_DOUBLE][Assembly::RYZEN] = cryptonight_double_hash; -# endif - # endif # ifdef XMRIG_ALGO_ARGON2 diff --git a/src/crypto/cn/CryptoNight_x86.h b/src/crypto/cn/CryptoNight_x86.h index 0727b9dc..25eeb908 100644 --- a/src/crypto/cn/CryptoNight_x86.h +++ b/src/crypto/cn/CryptoNight_x86.h @@ -789,6 +789,7 @@ extern "C" void cnv2_mainloop_bulldozer_asm(cryptonight_ctx **ctx); extern "C" void cnv2_double_mainloop_sandybridge_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_mainloop_asm(cryptonight_ctx **ctx); extern "C" void cnv2_rwz_double_mainloop_asm(cryptonight_ctx **ctx); +extern "C" void cnv2_upx_double_mainloop_zen3_asm(cryptonight_ctx * *ctx); namespace xmrig { @@ -986,7 +987,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_ # endif # ifdef XMRIG_ALGO_CN_FEMTO else if (ALGO == Algorithm::CN_UPX2) { - cn_upx2_double_mainloop_asm(ctx); + if (Cpu::info()->arch() == ICpuInfo::ARCH_ZEN3) { + cnv2_upx_double_mainloop_zen3_asm(ctx); + } + else { + cn_upx2_double_mainloop_asm(ctx); + } } # endif else if (ALGO == Algorithm::CN_RWZ) { diff --git a/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc new file mode 100644 index 00000000..2035f866 --- /dev/null +++ b/src/crypto/cn/asm/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -0,0 +1,322 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 216 + + mov rdi, QWORD PTR [rcx+8] + + mov edx, 768 + mov rbx, QWORD PTR [rcx] + mov ecx, 256 + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + mov r13, QWORD PTR [rdi+224] + movq xmm0, QWORD PTR [rdi+104] + mov r12, QWORD PTR [rbx+224] + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rax-136], xmm9 + movaps XMMWORD PTR [rax-152], xmm10 + movaps XMMWORD PTR [rsp+112], xmm11 + movaps XMMWORD PTR [rsp+96], xmm12 + movaps XMMWORD PTR [rsp+80], xmm13 + movq xmm13, QWORD PTR [rbx+96] + movaps XMMWORD PTR [rsp+64], xmm14 + movq xmm14, QWORD PTR [rbx+104] + movaps XMMWORD PTR [rsp+48], xmm15 + movq xmm15, QWORD PTR [rdi+96] + mov QWORD PTR [rsp], r13 + movdqa XMMWORD PTR [rsp+32], xmm0 + + stmxcsr DWORD PTR [rsp+24] + mov DWORD PTR [rsp+28], 24448 + ldmxcsr DWORD PTR [rsp+28] + + mov rcx, QWORD PTR [rbx+56] + xorps xmm12, xmm12 + xor rcx, QWORD PTR [rbx+24] + mov rax, QWORD PTR [rbx+48] + xor rax, QWORD PTR [rbx+16] + mov rsi, QWORD PTR [rbx+32] + mov rbp, QWORD PTR [rdi+32] + movq xmm0, rcx + + mov rcx, QWORD PTR [rbx+88] + xor rcx, QWORD PTR [rbx+72] + movq xmm7, rax + mov rax, QWORD PTR [rbx+80] + xor rax, QWORD PTR [rbx+64] + mov r14, QWORD PTR [rbx+40] + mov r15, QWORD PTR [rdi+40] + xor rsi, QWORD PTR [rbx] + xor rbp, QWORD PTR [rdi] + movq xmm9, rax + + mov rax, QWORD PTR [rdi+48] + xor rax, QWORD PTR [rdi+16] + xor r14, QWORD PTR [rbx+8] + xor r15, QWORD PTR [rdi+8] + movq xmm8, rax + punpcklqdq xmm7, xmm0 + + mov eax, 1023 + shl rax, 52 + movq xmm11, rax + punpcklqdq xmm11, xmm11 + + mov rax, QWORD PTR [rdi+80] + movq xmm0, rcx + mov rcx, QWORD PTR [rdi+56] + xor rcx, QWORD PTR [rdi+24] + punpcklqdq xmm9, xmm0 + mov QWORD PTR [rsp+8], 16384 + movq xmm0, rcx + mov rcx, QWORD PTR [rdi+88] + xor rcx, QWORD PTR [rdi+72] + xor rax, QWORD PTR [rdi+64] + punpcklqdq xmm8, xmm0 + movq xmm0, rcx + movq xmm10, rax + mov rax, 4389456576511 + mov QWORD PTR [rsp+16], rax + punpcklqdq xmm10, xmm0 + + ALIGN(64) +upx2_main_loop: + mov rdx, rsi + mov r9, rbp + and edx, 131056 + and r9d, 131056 + movdqu xmm6, XMMWORD PTR [rdx+r12] + lea r8, QWORD PTR [rdx+r12] + movdqu xmm4, XMMWORD PTR [r9+r13] + lea r10, QWORD PTR [r9+r13] + mov ecx, edx + mov eax, edx + xor rax, 32 + xor rcx, 48 + xor rdx, 16 + movq xmm0, r14 + movq xmm3, rsi + movq xmm5, rbp + punpcklqdq xmm3, xmm0 + movq xmm0, r15 + movdqu xmm2, XMMWORD PTR [rax+r12] + movdqu xmm1, XMMWORD PTR [rcx+r12] + paddq xmm2, xmm3 + punpcklqdq xmm5, xmm0 + paddq xmm1, xmm7 + aesenc xmm6, xmm3 + aesenc xmm4, xmm5 + movdqa xmm0, xmm9 + movq rdi, xmm4 + paddq xmm0, XMMWORD PTR [rdx+r12] + movdqu XMMWORD PTR [rdx+r12], xmm0 + xor edx, edx + movdqu XMMWORD PTR [rax+r12], xmm1 + movdqa xmm0, xmm6 + movdqu XMMWORD PTR [rcx+r12], xmm2 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r8], xmm0 + mov ecx, r9d + xor rcx, 48 + mov eax, r9d + xor rax, 32 + xor r9, 16 + movdqa xmm0, xmm10 + movdqu xmm1, XMMWORD PTR [rcx+r13] + movdqu xmm2, XMMWORD PTR [rax+r13] + paddq xmm1, xmm8 + paddq xmm0, XMMWORD PTR [r9+r13] + paddq xmm2, xmm5 + movdqu XMMWORD PTR [r9+r13], xmm0 + movq r9, xmm6 + movdqu XMMWORD PTR [rax+r13], xmm1 + movdqa xmm0, xmm4 + movdqu XMMWORD PTR [rcx+r13], xmm2 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [r10], xmm0 + movq rcx, xmm14 + mov rax, rcx + movq r10, xmm13 + shl rax, 32 + movdqa xmm0, xmm6 + xor r10, rax + psrldq xmm0, 8 + lea r8, QWORD PTR [rcx+rcx] + movq rax, xmm0 + add r8d, r9d + mov ecx, -2147483647 + or r8, rcx + mov r11, r9 + div r8 + and r11d, 131056 + movaps xmm1, xmm12 + mov eax, eax + add r11, r12 + shl rdx, 32 + add rdx, rax + xor r10, QWORD PTR [r11] + mov rbx, QWORD PTR [r11+8] + lea r8, QWORD PTR [rdx+r9] + movq xmm13, rdx + mov rax, r8 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm11 + sqrtsd xmm1, xmm0 + mov r13, -4389456576512 + movq rdx, xmm1 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + sub rcx, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp] + imul rcx, rax + mov rax, r10 + sub rcx, r8 + mov rcx, r9 + adc rdx, 0 + xor rcx, 32 + and ecx, 131056 + movq xmm14, rdx + movdqu xmm1, XMMWORD PTR [rcx+r12] + mul r9 + paddq xmm1, xmm3 + mov r8, rax + xor r8, QWORD PTR [rcx+r12+8] + add r14, r8 + movq xmm0, rax + movq xmm2, rdx + xor rdx, QWORD PTR [rcx+r12] + mov rax, r9 + xor rax, 48 + punpcklqdq xmm2, xmm0 + and eax, 131056 + add rsi, rdx + xor r9, 16 + xor edx, edx + and r9d, 131056 + movdqu xmm0, XMMWORD PTR [rax+r12] + paddq xmm0, xmm7 + pxor xmm2, XMMWORD PTR [r9+r12] + paddq xmm2, xmm9 + movdqu XMMWORD PTR [r9+r12], xmm2 + movq r9, xmm15 + movdqu XMMWORD PTR [rcx+r12], xmm0 + movdqa xmm0, xmm4 + mov rcx, QWORD PTR [rsp+32] + movdqu XMMWORD PTR [rax+r12], xmm1 + mov rax, rcx + shl rax, 32 + movaps xmm1, xmm12 + xor r9, rax + psrldq xmm0, 8 + lea r8, QWORD PTR [rcx+rcx] + mov QWORD PTR [r11], rsi + add r8d, edi + mov QWORD PTR [r11+8], r14 + movq rax, xmm0 + mov ecx, -2147483647 + or r8, rcx + xor rsi, r10 + div r8 + mov r10, rdi + xor r14, rbx + mov eax, eax + and r10d, 131056 + shl rdx, 32 + add r10, r13 + add rdx, rax + xor r9, QWORD PTR [r10] + mov r11, QWORD PTR [r10+8] + lea r8, QWORD PTR [rdx+rdi] + mov rax, r8 + movq xmm15, rdx + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm11 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + mov rax, rdx + shr rax, 20 + shr rdx, 19 + mov rcx, rdx + sub rcx, rax + mov rbx, 4389456576511 + sub rcx, rbx + movdqa xmm9, xmm7 + mov rbx, -4389456576512 + movdqa xmm7, xmm6 + add rax, rbx + imul rcx, rax + mov rax, r9 + sub rcx, r8 + mov rcx, rdi + adc rdx, 0 + xor rcx, 32 + and ecx, 131056 + movq xmm0, rdx + movdqu xmm1, XMMWORD PTR [rcx+r13] + mul rdi + movdqa XMMWORD PTR [rsp+32], xmm0 + paddq xmm1, xmm5 + mov r8, rax + xor r8, QWORD PTR [rcx+r13+8] + add r15, r8 + movq xmm0, rax + movq xmm2, rdx + xor rdx, QWORD PTR [rcx+r13] + mov rax, rdi + xor rdi, 16 + punpcklqdq xmm2, xmm0 + xor rax, 48 + and edi, 131056 + and eax, 131056 + add rbp, rdx + pxor xmm2, XMMWORD PTR [rdi+r13] + movdqu xmm0, XMMWORD PTR [rax+r13] + paddq xmm2, xmm10 + movdqu XMMWORD PTR [rdi+r13], xmm2 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rcx+r13], xmm0 + movdqa xmm10, xmm8 + movdqu XMMWORD PTR [rax+r13], xmm1 + movdqa xmm8, xmm4 + mov QWORD PTR [r10], rbp + xor rbp, r9 + mov QWORD PTR [r10+8], r15 + xor r15, r11 + sub QWORD PTR [rsp+8], 1 + jne upx2_main_loop + + ldmxcsr DWORD PTR [rsp+28] + + movaps xmm13, XMMWORD PTR [rsp+80] + lea r11, QWORD PTR [rsp+216] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+64] + movaps xmm15, XMMWORD PTR [rsp+48] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/cn/asm/cn_main_loop.S b/src/crypto/cn/asm/cn_main_loop.S index 609b0fe8..0dfd3ee2 100644 --- a/src/crypto/cn/asm/cn_main_loop.S +++ b/src/crypto/cn/asm/cn_main_loop.S @@ -17,6 +17,7 @@ .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv2_rwz_mainloop_asm) .global FN_PREFIX(cnv2_rwz_double_mainloop_asm) +.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm) ALIGN(64) FN_PREFIX(cnv2_mainloop_ivybridge_asm): @@ -72,6 +73,15 @@ FN_PREFIX(cnv2_rwz_double_mainloop_asm): ret 0 mov eax, 3735929054 +ALIGN(64) +FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn2/cnv2_upx_double_mainloop_zen3.inc" + add rsp, 48 + ret 0 + mov eax, 3735929054 + #if defined(__linux__) && defined(__ELF__) .section .note.GNU-stack,"",%progbits #endif diff --git a/src/crypto/cn/asm/cn_main_loop.asm b/src/crypto/cn/asm/cn_main_loop.asm index f0766a7c..97ae5299 100644 --- a/src/crypto/cn/asm/cn_main_loop.asm +++ b/src/crypto/cn/asm/cn_main_loop.asm @@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC mov eax, 3735929054 cnv2_rwz_double_mainloop_asm ENDP +ALIGN(64) +cnv2_upx_double_mainloop_zen3_asm PROC + INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc + ret 0 + mov eax, 3735929054 +cnv2_upx_double_mainloop_zen3_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END diff --git a/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc new file mode 100644 index 00000000..284956fe --- /dev/null +++ b/src/crypto/cn/asm/win64/cn2/cnv2_upx_double_mainloop_zen3.inc @@ -0,0 +1,322 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 216 + + mov rdi, QWORD PTR [rcx+8] + + mov edx, 768 + mov rbx, QWORD PTR [rcx] + mov ecx, 256 + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + mov r13, QWORD PTR [rdi+224] + movd xmm0, QWORD PTR [rdi+104] + mov r12, QWORD PTR [rbx+224] + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rax-136], xmm9 + movaps XMMWORD PTR [rax-152], xmm10 + movaps XMMWORD PTR [rsp+112], xmm11 + movaps XMMWORD PTR [rsp+96], xmm12 + movaps XMMWORD PTR [rsp+80], xmm13 + movd xmm13, QWORD PTR [rbx+96] + movaps XMMWORD PTR [rsp+64], xmm14 + movd xmm14, QWORD PTR [rbx+104] + movaps XMMWORD PTR [rsp+48], xmm15 + movd xmm15, QWORD PTR [rdi+96] + mov QWORD PTR [rsp], r13 + movdqa XMMWORD PTR [rsp+32], xmm0 + + stmxcsr DWORD PTR [rsp+24] + mov DWORD PTR [rsp+28], 24448 + ldmxcsr DWORD PTR [rsp+28] + + mov rcx, QWORD PTR [rbx+56] + xorps xmm12, xmm12 + xor rcx, QWORD PTR [rbx+24] + mov rax, QWORD PTR [rbx+48] + xor rax, QWORD PTR [rbx+16] + mov rsi, QWORD PTR [rbx+32] + mov rbp, QWORD PTR [rdi+32] + movd xmm0, rcx + + mov rcx, QWORD PTR [rbx+88] + xor rcx, QWORD PTR [rbx+72] + movd xmm7, rax + mov rax, QWORD PTR [rbx+80] + xor rax, QWORD PTR [rbx+64] + mov r14, QWORD PTR [rbx+40] + mov r15, QWORD PTR [rdi+40] + xor rsi, QWORD PTR [rbx] + xor rbp, QWORD PTR [rdi] + movd xmm9, rax + + mov rax, QWORD PTR [rdi+48] + xor rax, QWORD PTR [rdi+16] + xor r14, QWORD PTR [rbx+8] + xor r15, QWORD PTR [rdi+8] + movd xmm8, rax + punpcklqdq xmm7, xmm0 + + mov eax, 1023 + shl rax, 52 + movd xmm11, rax + punpcklqdq xmm11, xmm11 + + mov rax, QWORD PTR [rdi+80] + movd xmm0, rcx + mov rcx, QWORD PTR [rdi+56] + xor rcx, QWORD PTR [rdi+24] + punpcklqdq xmm9, xmm0 + mov QWORD PTR [rsp+8], 16384 + movd xmm0, rcx + mov rcx, QWORD PTR [rdi+88] + xor rcx, QWORD PTR [rdi+72] + xor rax, QWORD PTR [rdi+64] + punpcklqdq xmm8, xmm0 + movd xmm0, rcx + movd xmm10, rax + mov rax, 4389456576511 + mov QWORD PTR [rsp+16], rax + punpcklqdq xmm10, xmm0 + + ALIGN(64) +upx2_main_loop: + mov rdx, rsi + mov r9, rbp + and edx, 131056 + and r9d, 131056 + movdqu xmm6, XMMWORD PTR [rdx+r12] + lea r8, QWORD PTR [rdx+r12] + movdqu xmm4, XMMWORD PTR [r9+r13] + lea r10, QWORD PTR [r9+r13] + mov ecx, edx + mov eax, edx + xor rax, 32 + xor rcx, 48 + xor rdx, 16 + movd xmm0, r14 + movd xmm3, rsi + movd xmm5, rbp + punpcklqdq xmm3, xmm0 + movd xmm0, r15 + movdqu xmm2, XMMWORD PTR [rax+r12] + movdqu xmm1, XMMWORD PTR [rcx+r12] + paddq xmm2, xmm3 + punpcklqdq xmm5, xmm0 + paddq xmm1, xmm7 + aesenc xmm6, xmm3 + aesenc xmm4, xmm5 + movdqa xmm0, xmm9 + movd rdi, xmm4 + paddq xmm0, XMMWORD PTR [rdx+r12] + movdqu XMMWORD PTR [rdx+r12], xmm0 + xor edx, edx + movdqu XMMWORD PTR [rax+r12], xmm1 + movdqa xmm0, xmm6 + movdqu XMMWORD PTR [rcx+r12], xmm2 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r8], xmm0 + mov ecx, r9d + xor rcx, 48 + mov eax, r9d + xor rax, 32 + xor r9, 16 + movdqa xmm0, xmm10 + movdqu xmm1, XMMWORD PTR [rcx+r13] + movdqu xmm2, XMMWORD PTR [rax+r13] + paddq xmm1, xmm8 + paddq xmm0, XMMWORD PTR [r9+r13] + paddq xmm2, xmm5 + movdqu XMMWORD PTR [r9+r13], xmm0 + movd r9, xmm6 + movdqu XMMWORD PTR [rax+r13], xmm1 + movdqa xmm0, xmm4 + movdqu XMMWORD PTR [rcx+r13], xmm2 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [r10], xmm0 + movd rcx, xmm14 + mov rax, rcx + movd r10, xmm13 + shl rax, 32 + movdqa xmm0, xmm6 + xor r10, rax + psrldq xmm0, 8 + lea r8, QWORD PTR [rcx+rcx] + movd rax, xmm0 + add r8d, r9d + mov ecx, -2147483647 + or r8, rcx + mov r11, r9 + div r8 + and r11d, 131056 + movaps xmm1, xmm12 + mov eax, eax + add r11, r12 + shl rdx, 32 + add rdx, rax + xor r10, QWORD PTR [r11] + mov rbx, QWORD PTR [r11+8] + lea r8, QWORD PTR [rdx+r9] + movd xmm13, rdx + mov rax, r8 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm11 + sqrtsd xmm1, xmm0 + mov r13, -4389456576512 + movd rdx, xmm1 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + sub rcx, QWORD PTR [rsp+16] + mov r13, QWORD PTR [rsp] + imul rcx, rax + mov rax, r10 + sub rcx, r8 + mov rcx, r9 + adc rdx, 0 + xor rcx, 32 + and ecx, 131056 + movd xmm14, rdx + movdqu xmm1, XMMWORD PTR [rcx+r12] + mul r9 + paddq xmm1, xmm3 + mov r8, rax + xor r8, QWORD PTR [rcx+r12+8] + add r14, r8 + movd xmm0, rax + movd xmm2, rdx + xor rdx, QWORD PTR [rcx+r12] + mov rax, r9 + xor rax, 48 + punpcklqdq xmm2, xmm0 + and eax, 131056 + add rsi, rdx + xor r9, 16 + xor edx, edx + and r9d, 131056 + movdqu xmm0, XMMWORD PTR [rax+r12] + paddq xmm0, xmm7 + pxor xmm2, XMMWORD PTR [r9+r12] + paddq xmm2, xmm9 + movdqu XMMWORD PTR [r9+r12], xmm2 + movd r9, xmm15 + movdqu XMMWORD PTR [rcx+r12], xmm0 + movdqa xmm0, xmm4 + mov rcx, QWORD PTR [rsp+32] + movdqu XMMWORD PTR [rax+r12], xmm1 + mov rax, rcx + shl rax, 32 + movaps xmm1, xmm12 + xor r9, rax + psrldq xmm0, 8 + lea r8, QWORD PTR [rcx+rcx] + mov QWORD PTR [r11], rsi + add r8d, edi + mov QWORD PTR [r11+8], r14 + movd rax, xmm0 + mov ecx, -2147483647 + or r8, rcx + xor rsi, r10 + div r8 + mov r10, rdi + xor r14, rbx + mov eax, eax + and r10d, 131056 + shl rdx, 32 + add r10, r13 + add rdx, rax + xor r9, QWORD PTR [r10] + mov r11, QWORD PTR [r10+8] + lea r8, QWORD PTR [rdx+rdi] + mov rax, r8 + movd xmm15, rdx + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm11 + sqrtsd xmm1, xmm0 + movd rdx, xmm1 + mov rax, rdx + shr rax, 20 + shr rdx, 19 + mov rcx, rdx + sub rcx, rax + mov rbx, 4389456576511 + sub rcx, rbx + movdqa xmm9, xmm7 + mov rbx, -4389456576512 + movdqa xmm7, xmm6 + add rax, rbx + imul rcx, rax + mov rax, r9 + sub rcx, r8 + mov rcx, rdi + adc rdx, 0 + xor rcx, 32 + and ecx, 131056 + movd xmm0, rdx + movdqu xmm1, XMMWORD PTR [rcx+r13] + mul rdi + movdqa XMMWORD PTR [rsp+32], xmm0 + paddq xmm1, xmm5 + mov r8, rax + xor r8, QWORD PTR [rcx+r13+8] + add r15, r8 + movd xmm0, rax + movd xmm2, rdx + xor rdx, QWORD PTR [rcx+r13] + mov rax, rdi + xor rdi, 16 + punpcklqdq xmm2, xmm0 + xor rax, 48 + and edi, 131056 + and eax, 131056 + add rbp, rdx + pxor xmm2, XMMWORD PTR [rdi+r13] + movdqu xmm0, XMMWORD PTR [rax+r13] + paddq xmm2, xmm10 + movdqu XMMWORD PTR [rdi+r13], xmm2 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rcx+r13], xmm0 + movdqa xmm10, xmm8 + movdqu XMMWORD PTR [rax+r13], xmm1 + movdqa xmm8, xmm4 + mov QWORD PTR [r10], rbp + xor rbp, r9 + mov QWORD PTR [r10+8], r15 + xor r15, r11 + sub QWORD PTR [rsp+8], 1 + jne upx2_main_loop + + ldmxcsr DWORD PTR [rsp+28] + + movaps xmm13, XMMWORD PTR [rsp+80] + lea r11, QWORD PTR [rsp+216] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+64] + movaps xmm15, XMMWORD PTR [rsp+48] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/cn/asm/win64/cn_main_loop.S b/src/crypto/cn/asm/win64/cn_main_loop.S index 63c3a8ba..9361469a 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.S +++ b/src/crypto/cn/asm/win64/cn_main_loop.S @@ -7,6 +7,7 @@ .global cnv2_double_mainloop_sandybridge_asm .global cnv2_rwz_mainloop_asm .global cnv2_rwz_double_mainloop_asm +.global cnv2_upx_double_mainloop_zen3_asm ALIGN(64) cnv2_mainloop_ivybridge_asm: @@ -43,3 +44,9 @@ cnv2_rwz_double_mainloop_asm: #include "cn2/cnv2_rwz_double_main_loop.inc" ret 0 mov eax, 3735929054 + +ALIGN(64) +cnv2_upx_double_mainloop_zen3_asm: + #include "cn2/cnv2_upx_double_mainloop_zen3.inc" + ret 0 + mov eax, 3735929054 diff --git a/src/crypto/cn/asm/win64/cn_main_loop.asm b/src/crypto/cn/asm/win64/cn_main_loop.asm index 57246cf5..7f83e682 100644 --- a/src/crypto/cn/asm/win64/cn_main_loop.asm +++ b/src/crypto/cn/asm/win64/cn_main_loop.asm @@ -48,5 +48,12 @@ cnv2_rwz_double_mainloop_asm PROC mov eax, 3735929054 cnv2_rwz_double_mainloop_asm ENDP +ALIGN(64) +cnv2_upx_double_mainloop_zen3_asm PROC + INCLUDE cn2/cnv2_upx_double_mainloop_zen3.inc + ret 0 + mov eax, 3735929054 +cnv2_upx_double_mainloop_zen3_asm ENDP + _TEXT_CNV2_MAINLOOP ENDS END