Compiler fix

This commit is contained in:
SChernykh 2021-12-01 00:01:21 +01:00
parent efb322df66
commit e87d5111a2
19 changed files with 1401 additions and 279 deletions

View file

@ -0,0 +1,132 @@
; /* Loop fragment main_loop_cnv1_double: two independent lanes processed in lockstep. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> two context pointers, lane 0 at [rcx], lane 1 at [rcx+8]. */
; /* Context fields read: qwords at +0..+56, base pointer at +224, 64-bit value at +240, */
; /* and from lane 0 only a pointer at +232 that is indexed as a table of dwords. */
; /* Register roles inside the loop: */
; /*   rbx/rsi   lane 0 running pair (low/high), mirrored in xmm3 as the AES round key */
; /*   rdi/rbp   lane 1 running pair (low/high), mirrored in xmm4 */
; /*   xmm1/xmm2 current 16-byte block of lane 0/1 */
; /*   xmm5/xmm6 previous AES result of lane 0/1 */
; /*   r14/r15   base pointer of lane 0/1 (presumably the scratchpad, confirm) */
; /*   r13       dword table, r12 iterations left */
; /*   r10/r9    byte offset of the current block of lane 0/1 at the top of an iteration */
; /* Save rbx, rbp, rsi at [rsp+8..24] as seen on entry to the fragment (the includer must make */
; /* those slots writable), push rdi and r12-r15, then reserve 32 bytes: */
; /* [rsp] and [rsp+8] hold the +240 values of lane 0/1, [rsp+16] holds the saved xmm6. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 32
; /* Setup, interleaved for both lanes. 524288 is the iteration count. */
; /* Each running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* each previous-result register starts as (qword +16 xor +48, qword +24 xor +56). */
; /* 2097136 = 0x1FFFF0 keeps bits 4..20: a 16-byte aligned offset below 2 MiB. */
mov rdx, QWORD PTR [rcx]
mov r8, QWORD PTR [rcx+8]
mov r12d, 524288
movaps XMMWORD PTR [rsp+16], xmm6
mov rbx, QWORD PTR [rdx+32]
xor rbx, QWORD PTR [rdx]
mov rsi, QWORD PTR [rdx+40]
mov r10, rbx
xor rsi, QWORD PTR [rdx+8]
and r10d, 2097136
mov rdi, QWORD PTR [r8+32]
xor rdi, QWORD PTR [r8]
movq xmm3, rbx
mov rbp, QWORD PTR [r8+40]
mov r9, rdi
xor rbp, QWORD PTR [r8+8]
movq xmm0, rsi
mov rcx, QWORD PTR [rdx+56]
and r9d, 2097136
xor rcx, QWORD PTR [rdx+24]
movq xmm4, rdi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov r14, QWORD PTR [rdx+224]
mov r13, QWORD PTR [rdx+232]
mov r15, QWORD PTR [r8+224]
punpcklqdq xmm3, xmm0
movq xmm0, rbp
movq xmm5, rax
punpcklqdq xmm4, xmm0
mov rax, QWORD PTR [r8+48]
movq xmm0, rcx
xor rax, QWORD PTR [r8+16]
mov rcx, QWORD PTR [r8+56]
xor rcx, QWORD PTR [r8+24]
movdqu xmm1, XMMWORD PTR [r14+r10]
movq xmm6, rax
punpcklqdq xmm5, xmm0
mov rax, QWORD PTR [rdx+240]
movq xmm0, rcx
movdqu xmm2, XMMWORD PTR [r15+r9]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r8+240]
mov QWORD PTR [rsp+8], rax
punpcklqdq xmm6, xmm0
ALIGN(64)
main_loop_cnv1_double:
; /* First half, both lanes interleaved: one AES round on the current block with the lane key, */
; /* XOR with the previous AES result, and write back to the same 16 bytes. */
; /* The low qword is stored unchanged; the high qword is first XORed with the table dword */
; /* selected by bits 24..31 of that qword. xmm5/xmm6 then take the new AES result. */
; /* r11 keeps the low qword of the lane 1 AES result for the second half. */
aesenc xmm1, xmm3
aesenc xmm2, xmm4
movdqa xmm0, xmm1
movq r11, xmm2
pxor xmm0, xmm5
movdqa xmm5, xmm1
movq QWORD PTR [r14+r10], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
movdqa xmm0, xmm2
shr rax, 24
pxor xmm0, xmm6
movdqa xmm6, xmm2
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
mov QWORD PTR [r14+r10+8], rax
movq QWORD PTR [r15+r9], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
shr rax, 24
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
; /* Second half, lane 0: the low qword of the AES result (rcx) picks the second block at r9. */
; /* rdx:rax = rcx * low qword of that block; rsi += low half, rbx += high half. */
; /* Store rbx and (rsi xor [rsp]) there, then fold the old block contents into rbx/rsi. */
movq rcx, xmm1
mov QWORD PTR [r15+r9+8], rax
mov r9, rcx
and r9d, 2097136
mov r10, QWORD PTR [r14+r9]
mov r8, QWORD PTR [r14+r9+8]
mov rax, r10
mul rcx
add rsi, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [r14+r9], rbx
xor rax, rsi
mov QWORD PTR [r14+r9+8], rax
xor rsi, r8
xor rbx, r10
; /* Second half, lane 1 (multiplier r11, second block at r8), overlapped with rebuilding */
; /* the lane 0 key in xmm3 and loading the next lane 0 block at r10. */
mov r8, r11
and r8d, 2097136
mov r10, rbx
and r10d, 2097136
movq xmm3, rbx
pinsrq xmm3, rsi, 1
mov r9, QWORD PTR [r15+r8]
mov rcx, QWORD PTR [r15+r8+8]
mov rax, r9
movdqu xmm1, XMMWORD PTR [r14+r10]
mul r11
add rbp, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r15+r8], rdi
xor rax, rbp
xor rdi, r9
mov QWORD PTR [r15+r8+8], rax
mov r9, rdi
xor rbp, rcx
and r9d, 2097136
movq xmm4, rdi
pinsrq xmm4, rbp, 1
movdqu xmm2, XMMWORD PTR [r15+r9]
sub r12, 1
jne main_loop_cnv1_double
; /* Epilogue: rsp is 72 bytes below its value at fragment entry (5 pushes + 32), */
; /* so the slots written at [rsp+8..24] on entry are now at [rsp+80..96]. */
mov rbx, QWORD PTR [rsp+80]
mov rbp, QWORD PTR [rsp+88]
mov rsi, QWORD PTR [rsp+96]
movaps xmm6, XMMWORD PTR [rsp+16]
add rsp, 32
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,263 @@
; /* Loop fragment main_loop_cnv1_quad: four independent lanes processed in lockstep. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> four context pointers at [rcx], [rcx+8], [rcx+16], [rcx+24] (lanes 0..3). */
; /* Context fields read: qwords at +0..+56, base pointer at +224, 64-bit value at +240, */
; /* and from lane 0 only a pointer at +232 that is indexed as a table of dwords. */
; /* Register roles inside the loop: */
; /*   xmm1..xmm4    running 128-bit pair of lane 0..3, used as the AES round key */
; /*   xmm5..xmm8    previous AES result of lane 0..3 */
; /*   xmm9..xmm12   current 16-byte block of lane 0..3 */
; /*   rbp,r14,r15,r12  base pointer of lane 0..3 (presumably the scratchpad, confirm) */
; /*   rbx,rdi,rsi,rax  byte offset of the current block of lane 0..3 at the top of an iteration */
; /*   r13 dword table, r11 iterations left */
; /*   [rsp], [rsp+8], [rsp+16], [rsp+24]  the +240 value of lane 0..3 */
; /* Save rbx, rbp, rsi at [rsp+8..24] as seen on entry to the fragment (the includer must make */
; /* those slots writable), push rdi and r12-r15, then reserve 144 bytes. rax keeps the entry rsp. */
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+16], rbp
mov QWORD PTR [rax+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 144
; /* Setup, interleaved for all lanes. */
; /* Each running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* each previous-result register starts as (qword +16 xor +48, qword +24 xor +56). */
; /* 2097136 = 0x1FFFF0 keeps bits 4..20: a 16-byte aligned offset below 2 MiB. */
mov r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+8]
mov r10, QWORD PTR [rcx+16]
mov r11, QWORD PTR [rcx+24]
mov rbp, QWORD PTR [r8+224]
mov r13, QWORD PTR [r8+232]
mov r14, QWORD PTR [r9+224]
mov r15, QWORD PTR [r10+224]
mov r12, QWORD PTR [r11+224]
mov rcx, QWORD PTR [r8+40]
xor rcx, QWORD PTR [r8+8]
mov rbx, QWORD PTR [r8+32]
xor rbx, QWORD PTR [r8]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
movq xmm0, rcx
mov rcx, QWORD PTR [r9+40]
xor rcx, QWORD PTR [r9+8]
movq xmm1, rbx
; /* Save xmm6..xmm12. rax is still the entry rsp and rsp = rax-184, so the slots are */
; /* xmm6 [rsp+128], xmm7 [rsp+112], xmm8 [rsp+96], xmm9 [rsp+80], xmm10 [rsp+64], */
; /* xmm11 [rsp+48], xmm12 [rsp+32]. */
movaps XMMWORD PTR [rax-56], xmm6
movaps XMMWORD PTR [rax-72], xmm7
movaps XMMWORD PTR [rax-88], xmm8
movaps XMMWORD PTR [rax-104], xmm9
movaps XMMWORD PTR [rax-120], xmm10
movaps XMMWORD PTR [rsp+48], xmm11
movaps XMMWORD PTR [rsp+32], xmm12
and ebx, 2097136
mov rsi, QWORD PTR [r10+32]
movq xmm2, rdi
mov rax, QWORD PTR [r8+240]
and edi, 2097136
xor rsi, QWORD PTR [r10]
mov rdx, QWORD PTR [r8+56]
xor rdx, QWORD PTR [r8+24]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r9+240]
movq xmm3, rsi
mov QWORD PTR [rsp+8], rax
and esi, 2097136
mov rax, QWORD PTR [r10+240]
punpcklqdq xmm1, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r10+40]
xor rcx, QWORD PTR [r10+8]
mov QWORD PTR [rsp+16], rax
mov rax, QWORD PTR [r11+240]
punpcklqdq xmm2, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+24], rax
mov rcx, QWORD PTR [r11+40]
xor rcx, QWORD PTR [r11+8]
mov rax, QWORD PTR [r11+32]
xor rax, QWORD PTR [r11]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r8+48]
xor rcx, QWORD PTR [r8+16]
movq xmm4, rax
and eax, 2097136
punpcklqdq xmm4, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r9+56]
xor rdx, QWORD PTR [r9+24]
movq xmm5, rcx
mov rcx, QWORD PTR [r9+48]
xor rcx, QWORD PTR [r9+16]
punpcklqdq xmm5, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r10+56]
xor rdx, QWORD PTR [r10+24]
movq xmm6, rcx
mov rcx, QWORD PTR [r10+48]
xor rcx, QWORD PTR [r10+16]
punpcklqdq xmm6, xmm0
movq xmm0, rdx
mov rdx, QWORD PTR [r11+56]
movq xmm7, rcx
punpcklqdq xmm7, xmm0
xor rdx, QWORD PTR [r11+24]
mov rcx, QWORD PTR [r11+48]
xor rcx, QWORD PTR [r11+16]
; /* r11 is free after its last context read above; it becomes the iteration counter (524288). */
mov r11d, 524288
movdqu xmm9, XMMWORD PTR [rbp+rbx]
movdqu xmm10, XMMWORD PTR [r14+rdi]
movq xmm0, rdx
movdqu xmm11, XMMWORD PTR [r15+rsi]
movdqu xmm12, XMMWORD PTR [r12+rax]
movq xmm8, rcx
punpcklqdq xmm8, xmm0
ALIGN(64)
main_loop_cnv1_quad:
; /* One AES round per lane on the current block, keyed with the lane running pair. */
aesenc xmm9, xmm1
aesenc xmm10, xmm2
aesenc xmm11, xmm3
aesenc xmm12, xmm4
; /* Prefetch, per lane, the block addressed by the low dword of the AES result */
; /* masked with 0x1FFFF0; the second half below reads that block. */
movd ecx, xmm9
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+rbp]
movd ecx, xmm10
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r14]
movd ecx, xmm11
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r15]
movd ecx, xmm12
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r12]
; /* First half write-back, lanes 0..3 in turn: AES result xor previous AES result goes back */
; /* to the current block. The low qword is stored unchanged; the high qword is first XORed */
; /* with the table dword selected by bits 24..31 of that qword. */
; /* Only xmm5 is refreshed here; xmm6..xmm8 are refreshed further down. */
; /* rbx and rdi are reloaded with the low qword of xmm1 / xmm2 once their offsets are used. */
movdqa xmm0, xmm9
pxor xmm0, xmm5
movdqa xmm5, xmm9
movq QWORD PTR [rbp+rbx], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm10
shr rcx, 24
pxor xmm0, xmm6
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [rbp+rbx+8], rcx
movq rbx, xmm1
movq QWORD PTR [r14+rdi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm11
shr rcx, 24
pxor xmm0, xmm7
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r14+rdi+8], rcx
movq rdi, xmm2
movq QWORD PTR [r15+rsi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm12
shr rcx, 24
pxor xmm0, xmm8
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r15+rsi+8], rcx
movq QWORD PTR [r12+rax], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
shr rcx, 24
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r12+rax+8], rcx
; /* Second half, lane 0: the low qword of the AES result (rcx) picks the second block at r8. */
; /* rdx:rax = rcx * low qword of that block; high qword of xmm1 += low half, rbx += high half. */
; /* Store rbx and (high xor [rsp]) there, fold the old block contents in, rebuild xmm1, */
; /* then mask rbx into the next offset and load the next block into xmm9. */
movq rcx, xmm9
mov r8, rcx
and r8d, 2097136
mov r9, QWORD PTR [rbp+r8]
mov r10, QWORD PTR [rbp+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm1, 1
add rcx, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [rbp+r8], rbx
xor rax, rcx
mov QWORD PTR [rbp+r8+8], rax
xor rcx, r10
xor rbx, r9
movq xmm1, rbx
and ebx, 2097136
pinsrq xmm1, rcx, 1
; /* Second half, lane 1: same steps with xmm10, r14, rdi, xmm2 and [rsp+8]. */
movq rcx, xmm10
mov r8, rcx
and r8d, 2097136
movdqu xmm9, XMMWORD PTR [rbp+rbx]
mov r9, QWORD PTR [r14+r8]
mov r10, QWORD PTR [r14+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm2, 1
add rcx, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r14+r8], rdi
xor rax, rcx
xor rdi, r9
mov QWORD PTR [r14+r8+8], rax
xor rcx, r10
movq xmm2, rdi
and edi, 2097136
pinsrq xmm2, rcx, 1
; /* Second half, lane 2: same steps with xmm11, r15, rsi, xmm3 and [rsp+16]. */
; /* xmm6..xmm8 take the AES results of lanes 1..3 here, before xmm10 is overwritten. */
movq rcx, xmm11
movq rsi, xmm3
mov r8, rcx
and r8d, 2097136
movdqa xmm6, xmm10
movdqa xmm7, xmm11
movdqa xmm8, xmm12
movdqu xmm10, XMMWORD PTR [r14+rdi]
mov r9, QWORD PTR [r15+r8]
mov r10, QWORD PTR [r15+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm3, 1
add rcx, rax
add rsi, rdx
mov rax, QWORD PTR [rsp+16]
xor rax, rcx
mov QWORD PTR [r15+r8], rsi
mov QWORD PTR [r15+r8+8], rax
xor rcx, r10
xor rsi, r9
movq xmm3, rsi
and esi, 2097136
pinsrq xmm3, rcx, 1
; /* Second half, lane 3: same arithmetic with xmm12, r12, xmm4 and [rsp+24]; */
; /* here rax carries the low half of the pair and rdx the high half. */
movq rcx, xmm12
mov r8, rcx
and r8d, 2097136
movdqu xmm11, XMMWORD PTR [r15+rsi]
mov r9, QWORD PTR [r12+r8]
mov r10, QWORD PTR [r12+r8+8]
mov rax, r9
mul rcx
mov rcx, rax
movq rax, xmm4
add rax, rdx
mov QWORD PTR [r12+r8], rax
xor rax, r9
pextrq rdx, xmm4, 1
add rdx, rcx
mov rcx, QWORD PTR [rsp+24]
xor rcx, rdx
xor rdx, r10
movq xmm4, rax
mov QWORD PTR [r12+r8+8], rcx
and eax, 2097136
pinsrq xmm4, rdx, 1
movdqu xmm12, XMMWORD PTR [r12+rax]
sub r11, 1
jne main_loop_cnv1_quad
; /* Epilogue: r11 = rsp+144 is the stack pointer as it was right after the five pushes. */
; /* The entry slots [rsp+8..24] are at [r11+48..64]; the xmm save slots are reloaded */
; /* from the same addresses they were written to in the prologue. */
movaps xmm7, XMMWORD PTR [rsp+112]
lea r11, QWORD PTR [rsp+144]
mov rbx, QWORD PTR [r11+48]
mov rbp, QWORD PTR [r11+56]
mov rsi, QWORD PTR [r11+64]
movaps xmm6, XMMWORD PTR [r11-16]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm9, XMMWORD PTR [r11-64]
movaps xmm10, XMMWORD PTR [r11-80]
movaps xmm11, XMMWORD PTR [r11-96]
movaps xmm12, XMMWORD PTR [r11-112]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,66 @@
; /* Loop fragment main_loop_cnv1_single: one lane. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> one context pointer at [rcx]. */
; /* Context fields read: qwords at +0..+56, base pointer at +224, pointer to a table of */
; /* dwords at +232, 64-bit value at +240. */
; /* Register roles inside the loop: */
; /*   r11/rbx running pair (low/high), xmm2 previous AES result */
; /*   rdi base pointer (presumably the scratchpad, confirm), r14 dword table */
; /*   rbp the +240 value, rsi iterations left */
; /* Save rbx, rbp, rsi, rdi at [rsp+8..32] as seen on entry to the fragment (the includer */
; /* must make those slots writable) and push r13-r15. r13 is pushed and popped but not */
; /* otherwise used in this fragment. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r13
push r14
push r15
; /* Setup. 524288 is the iteration count. */
; /* The running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* xmm2 starts as (qword +16 xor +48, qword +24 xor +56). */
mov rdx, QWORD PTR [rcx]
mov esi, 524288
mov r11, QWORD PTR [rdx+32]
xor r11, QWORD PTR [rdx]
mov rdi, QWORD PTR [rdx+224]
mov rbx, QWORD PTR [rdx+40]
xor rbx, QWORD PTR [rdx+8]
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov rbp, QWORD PTR [rdx+240]
mov r14, QWORD PTR [rdx+232]
movq xmm2, rax
pinsrq xmm2, rcx, 1
ALIGN(64)
main_loop_cnv1_single:
; /* First half: r8 = r11 & 0x1FFFF0 (bits 4..20, a 16-byte aligned offset below 2 MiB). */
; /* Load that block, run one AES round keyed with rbx:r11, XOR with the previous AES */
; /* result and write back. The low qword is stored unchanged; the high qword is first */
; /* XORed with the table dword selected by bits 24..31 of that qword. */
; /* r15 keeps the low qword of the AES result and r9 the offset derived from it. */
mov r8, r11
and r8d, 2097136
movdqu xmm1, XMMWORD PTR [rdi+r8]
movq xmm0, r11
pinsrq xmm0, rbx, 1
aesenc xmm1, xmm0
movq r15, xmm1
mov r9, r15
and r9d, 2097136
movdqa xmm0, xmm1
pxor xmm0, xmm2
movdqa xmm2, xmm1
movq QWORD PTR [rdi+r8], xmm0
pextrq rdx, xmm0, 1
mov eax, edx
shr rax, 24
mov ecx, DWORD PTR [r14+rax*4]
xor rcx, rdx
mov QWORD PTR [rdi+r8+8], rcx
; /* Second half: rdx:rax = r15 * low qword of the block at r9; rbx += low half, */
; /* r11 += high half. Store r11 and (rbx xor rbp) there, then fold the old block */
; /* contents into r11/rbx for the next iteration. */
mov r10, QWORD PTR [rdi+r9]
mov r8, QWORD PTR [rdi+r9+8]
mov rax, r10
mul r15
add rbx, rax
add r11, rdx
mov QWORD PTR [rdi+r9], r11
mov rax, rbx
xor rbx, r8
xor r11, r10
xor rax, rbp
mov QWORD PTR [rdi+r9+8], rax
sub rsi, 1
jne main_loop_cnv1_single
; /* Epilogue: after the three pops rsp is back at its entry value, so the saved */
; /* registers are read from the same [rsp+8..32] slots they were written to. */
pop r15
pop r14
pop r13
mov rbx, QWORD PTR [rsp+8]
mov rbp, QWORD PTR [rsp+16]
mov rsi, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [rsp+32]

View file

@ -11,6 +11,9 @@
# define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv1_single_mainloop_asm)
.global FN_PREFIX(cnv1_double_mainloop_asm)
.global FN_PREFIX(cnv1_quad_mainloop_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
@ -19,6 +22,33 @@
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
.global FN_PREFIX(cnv2_upx_double_mainloop_zen3_asm)
ALIGN(64)
/*
 * cnv1_single_mainloop_asm: entry stub around the included single-lane loop.
 * Reserves 48 bytes of stack and copies rdi into rcx before the included body
 * runs (presumably the body expects its argument in rcx and writes to low
 * [rsp+N] slots -- confirm against the fragment), then releases the stack.
 */
FN_PREFIX(cnv1_single_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_single_main_loop.inc"
add rsp, 48
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
/*
 * cnv1_double_mainloop_asm: entry stub around the included two-lane loop.
 * Reserves 48 bytes of stack and copies rdi into rcx before the included body
 * runs (presumably the body expects its argument in rcx and writes to low
 * [rsp+N] slots -- confirm against the fragment), then releases the stack.
 */
FN_PREFIX(cnv1_double_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_double_main_loop.inc"
add rsp, 48
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
/*
 * cnv1_quad_mainloop_asm: entry stub around the included four-lane loop.
 * Reserves 48 bytes of stack and copies rdi into rcx before the included body
 * runs (presumably the body expects its argument in rcx and writes to low
 * [rsp+N] slots -- confirm against the fragment), then releases the stack.
 */
FN_PREFIX(cnv1_quad_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn1/cnv1_quad_main_loop.inc"
add rsp, 48
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
sub rsp, 48

View file

@ -1,4 +1,7 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_single_mainloop_asm
PUBLIC cnv1_double_mainloop_asm
PUBLIC cnv1_quad_mainloop_asm
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
@ -6,6 +9,27 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN(64)
; cnv1_single_mainloop_asm: the procedure body is the included single-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_single_mainloop_asm PROC
INCLUDE cn1/cnv1_single_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_single_mainloop_asm ENDP
ALIGN(64)
; cnv1_double_mainloop_asm: the procedure body is the included two-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_double_mainloop_asm PROC
INCLUDE cn1/cnv1_double_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_double_mainloop_asm ENDP
ALIGN(64)
; cnv1_quad_mainloop_asm: the procedure body is the included four-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_quad_mainloop_asm PROC
INCLUDE cn1/cnv1_quad_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_quad_mainloop_asm ENDP
ALIGN(64)
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc

View file

@ -0,0 +1,132 @@
; /* Loop fragment main_loop_cnv1_double: two independent lanes processed in lockstep. */
; /* This copy spells every 64-bit GPR/XMM transfer as movd with a 64-bit operand. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> two context pointers, lane 0 at [rcx], lane 1 at [rcx+8]. */
; /* Context fields read: qwords at +0..+56, base pointer at +224, 64-bit value at +240, */
; /* and from lane 0 only a pointer at +232 that is indexed as a table of dwords. */
; /* Register roles inside the loop: */
; /*   rbx/rsi   lane 0 running pair (low/high), mirrored in xmm3 as the AES round key */
; /*   rdi/rbp   lane 1 running pair (low/high), mirrored in xmm4 */
; /*   xmm1/xmm2 current 16-byte block of lane 0/1 */
; /*   xmm5/xmm6 previous AES result of lane 0/1 */
; /*   r14/r15   base pointer of lane 0/1 (presumably the scratchpad, confirm) */
; /*   r13       dword table, r12 iterations left */
; /*   r10/r9    byte offset of the current block of lane 0/1 at the top of an iteration */
; /* Save rbx, rbp, rsi at [rsp+8..24] as seen on entry to the fragment (the includer must make */
; /* those slots writable), push rdi and r12-r15, then reserve 32 bytes: */
; /* [rsp] and [rsp+8] hold the +240 values of lane 0/1, [rsp+16] holds the saved xmm6. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 32
; /* Setup, interleaved for both lanes. 524288 is the iteration count. */
; /* Each running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* each previous-result register starts as (qword +16 xor +48, qword +24 xor +56). */
; /* 2097136 = 0x1FFFF0 keeps bits 4..20: a 16-byte aligned offset below 2 MiB. */
mov rdx, QWORD PTR [rcx]
mov r8, QWORD PTR [rcx+8]
mov r12d, 524288
movaps XMMWORD PTR [rsp+16], xmm6
mov rbx, QWORD PTR [rdx+32]
xor rbx, QWORD PTR [rdx]
mov rsi, QWORD PTR [rdx+40]
mov r10, rbx
xor rsi, QWORD PTR [rdx+8]
and r10d, 2097136
mov rdi, QWORD PTR [r8+32]
xor rdi, QWORD PTR [r8]
movd xmm3, rbx
mov rbp, QWORD PTR [r8+40]
mov r9, rdi
xor rbp, QWORD PTR [r8+8]
movd xmm0, rsi
mov rcx, QWORD PTR [rdx+56]
and r9d, 2097136
xor rcx, QWORD PTR [rdx+24]
movd xmm4, rdi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov r14, QWORD PTR [rdx+224]
mov r13, QWORD PTR [rdx+232]
mov r15, QWORD PTR [r8+224]
punpcklqdq xmm3, xmm0
movd xmm0, rbp
movd xmm5, rax
punpcklqdq xmm4, xmm0
mov rax, QWORD PTR [r8+48]
movd xmm0, rcx
xor rax, QWORD PTR [r8+16]
mov rcx, QWORD PTR [r8+56]
xor rcx, QWORD PTR [r8+24]
movdqu xmm1, XMMWORD PTR [r14+r10]
movd xmm6, rax
punpcklqdq xmm5, xmm0
mov rax, QWORD PTR [rdx+240]
movd xmm0, rcx
movdqu xmm2, XMMWORD PTR [r15+r9]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r8+240]
mov QWORD PTR [rsp+8], rax
punpcklqdq xmm6, xmm0
ALIGN(64)
main_loop_cnv1_double:
; /* First half, both lanes interleaved: one AES round on the current block with the lane key, */
; /* XOR with the previous AES result, and write back to the same 16 bytes. */
; /* The low qword is stored unchanged; the high qword is first XORed with the table dword */
; /* selected by bits 24..31 of that qword. xmm5/xmm6 then take the new AES result. */
; /* r11 keeps the low qword of the lane 1 AES result for the second half. */
aesenc xmm1, xmm3
aesenc xmm2, xmm4
movdqa xmm0, xmm1
movd r11, xmm2
pxor xmm0, xmm5
movdqa xmm5, xmm1
movd QWORD PTR [r14+r10], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
movdqa xmm0, xmm2
shr rax, 24
pxor xmm0, xmm6
movdqa xmm6, xmm2
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
mov QWORD PTR [r14+r10+8], rax
movd QWORD PTR [r15+r9], xmm0
pextrq rcx, xmm0, 1
mov eax, ecx
shr rax, 24
mov eax, DWORD PTR [r13+rax*4]
xor rax, rcx
; /* Second half, lane 0: the low qword of the AES result (rcx) picks the second block at r9. */
; /* rdx:rax = rcx * low qword of that block; rsi += low half, rbx += high half. */
; /* Store rbx and (rsi xor [rsp]) there, then fold the old block contents into rbx/rsi. */
movd rcx, xmm1
mov QWORD PTR [r15+r9+8], rax
mov r9, rcx
and r9d, 2097136
mov r10, QWORD PTR [r14+r9]
mov r8, QWORD PTR [r14+r9+8]
mov rax, r10
mul rcx
add rsi, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [r14+r9], rbx
xor rax, rsi
mov QWORD PTR [r14+r9+8], rax
xor rsi, r8
xor rbx, r10
; /* Second half, lane 1 (multiplier r11, second block at r8), overlapped with rebuilding */
; /* the lane 0 key in xmm3 and loading the next lane 0 block at r10. */
mov r8, r11
and r8d, 2097136
mov r10, rbx
and r10d, 2097136
movd xmm3, rbx
pinsrq xmm3, rsi, 1
mov r9, QWORD PTR [r15+r8]
mov rcx, QWORD PTR [r15+r8+8]
mov rax, r9
movdqu xmm1, XMMWORD PTR [r14+r10]
mul r11
add rbp, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r15+r8], rdi
xor rax, rbp
xor rdi, r9
mov QWORD PTR [r15+r8+8], rax
mov r9, rdi
xor rbp, rcx
and r9d, 2097136
movd xmm4, rdi
pinsrq xmm4, rbp, 1
movdqu xmm2, XMMWORD PTR [r15+r9]
sub r12, 1
jne main_loop_cnv1_double
; /* Epilogue: rsp is 72 bytes below its value at fragment entry (5 pushes + 32), */
; /* so the slots written at [rsp+8..24] on entry are now at [rsp+80..96]. */
mov rbx, QWORD PTR [rsp+80]
mov rbp, QWORD PTR [rsp+88]
mov rsi, QWORD PTR [rsp+96]
movaps xmm6, XMMWORD PTR [rsp+16]
add rsp, 32
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,263 @@
; /* Loop fragment main_loop_cnv1_quad: four independent lanes processed in lockstep. */
; /* This copy spells every 64-bit GPR/XMM transfer as movd with a 64-bit operand. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> four context pointers at [rcx], [rcx+8], [rcx+16], [rcx+24] (lanes 0..3). */
; /* Context fields read: qwords at +0..+56, base pointer at +224, 64-bit value at +240, */
; /* and from lane 0 only a pointer at +232 that is indexed as a table of dwords. */
; /* Register roles inside the loop: */
; /*   xmm1..xmm4    running 128-bit pair of lane 0..3, used as the AES round key */
; /*   xmm5..xmm8    previous AES result of lane 0..3 */
; /*   xmm9..xmm12   current 16-byte block of lane 0..3 */
; /*   rbp,r14,r15,r12  base pointer of lane 0..3 (presumably the scratchpad, confirm) */
; /*   rbx,rdi,rsi,rax  byte offset of the current block of lane 0..3 at the top of an iteration */
; /*   r13 dword table, r11 iterations left */
; /*   [rsp], [rsp+8], [rsp+16], [rsp+24]  the +240 value of lane 0..3 */
; /* Save rbx, rbp, rsi at [rsp+8..24] as seen on entry to the fragment (the includer must make */
; /* those slots writable), push rdi and r12-r15, then reserve 144 bytes. rax keeps the entry rsp. */
mov rax, rsp
mov QWORD PTR [rax+8], rbx
mov QWORD PTR [rax+16], rbp
mov QWORD PTR [rax+24], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 144
; /* Setup, interleaved for all lanes. */
; /* Each running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* each previous-result register starts as (qword +16 xor +48, qword +24 xor +56). */
; /* 2097136 = 0x1FFFF0 keeps bits 4..20: a 16-byte aligned offset below 2 MiB. */
mov r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+8]
mov r10, QWORD PTR [rcx+16]
mov r11, QWORD PTR [rcx+24]
mov rbp, QWORD PTR [r8+224]
mov r13, QWORD PTR [r8+232]
mov r14, QWORD PTR [r9+224]
mov r15, QWORD PTR [r10+224]
mov r12, QWORD PTR [r11+224]
mov rcx, QWORD PTR [r8+40]
xor rcx, QWORD PTR [r8+8]
mov rbx, QWORD PTR [r8+32]
xor rbx, QWORD PTR [r8]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
movd xmm0, rcx
mov rcx, QWORD PTR [r9+40]
xor rcx, QWORD PTR [r9+8]
movd xmm1, rbx
; /* Save xmm6..xmm12. rax is still the entry rsp and rsp = rax-184, so the slots are */
; /* xmm6 [rsp+128], xmm7 [rsp+112], xmm8 [rsp+96], xmm9 [rsp+80], xmm10 [rsp+64], */
; /* xmm11 [rsp+48], xmm12 [rsp+32]. */
movaps XMMWORD PTR [rax-56], xmm6
movaps XMMWORD PTR [rax-72], xmm7
movaps XMMWORD PTR [rax-88], xmm8
movaps XMMWORD PTR [rax-104], xmm9
movaps XMMWORD PTR [rax-120], xmm10
movaps XMMWORD PTR [rsp+48], xmm11
movaps XMMWORD PTR [rsp+32], xmm12
and ebx, 2097136
mov rsi, QWORD PTR [r10+32]
movd xmm2, rdi
mov rax, QWORD PTR [r8+240]
and edi, 2097136
xor rsi, QWORD PTR [r10]
mov rdx, QWORD PTR [r8+56]
xor rdx, QWORD PTR [r8+24]
mov QWORD PTR [rsp], rax
mov rax, QWORD PTR [r9+240]
movd xmm3, rsi
mov QWORD PTR [rsp+8], rax
and esi, 2097136
mov rax, QWORD PTR [r10+240]
punpcklqdq xmm1, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r10+40]
xor rcx, QWORD PTR [r10+8]
mov QWORD PTR [rsp+16], rax
mov rax, QWORD PTR [r11+240]
punpcklqdq xmm2, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp+24], rax
mov rcx, QWORD PTR [r11+40]
xor rcx, QWORD PTR [r11+8]
mov rax, QWORD PTR [r11+32]
xor rax, QWORD PTR [r11]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r8+48]
xor rcx, QWORD PTR [r8+16]
movd xmm4, rax
and eax, 2097136
punpcklqdq xmm4, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r9+56]
xor rdx, QWORD PTR [r9+24]
movd xmm5, rcx
mov rcx, QWORD PTR [r9+48]
xor rcx, QWORD PTR [r9+16]
punpcklqdq xmm5, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r10+56]
xor rdx, QWORD PTR [r10+24]
movd xmm6, rcx
mov rcx, QWORD PTR [r10+48]
xor rcx, QWORD PTR [r10+16]
punpcklqdq xmm6, xmm0
movd xmm0, rdx
mov rdx, QWORD PTR [r11+56]
movd xmm7, rcx
punpcklqdq xmm7, xmm0
xor rdx, QWORD PTR [r11+24]
mov rcx, QWORD PTR [r11+48]
xor rcx, QWORD PTR [r11+16]
; /* r11 is free after its last context read above; it becomes the iteration counter (524288). */
mov r11d, 524288
movdqu xmm9, XMMWORD PTR [rbp+rbx]
movdqu xmm10, XMMWORD PTR [r14+rdi]
movd xmm0, rdx
movdqu xmm11, XMMWORD PTR [r15+rsi]
movdqu xmm12, XMMWORD PTR [r12+rax]
movd xmm8, rcx
punpcklqdq xmm8, xmm0
ALIGN(64)
main_loop_cnv1_quad:
; /* One AES round per lane on the current block, keyed with the lane running pair. */
aesenc xmm9, xmm1
aesenc xmm10, xmm2
aesenc xmm11, xmm3
aesenc xmm12, xmm4
; /* Prefetch, per lane, the block addressed by the low dword of the AES result */
; /* masked with 0x1FFFF0; the second half below reads that block. */
movd ecx, xmm9
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+rbp]
movd ecx, xmm10
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r14]
movd ecx, xmm11
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r15]
movd ecx, xmm12
and ecx, 2097136
prefetcht0 BYTE PTR [rcx+r12]
; /* First half write-back, lanes 0..3 in turn: AES result xor previous AES result goes back */
; /* to the current block. The low qword is stored unchanged; the high qword is first XORed */
; /* with the table dword selected by bits 24..31 of that qword. */
; /* Only xmm5 is refreshed here; xmm6..xmm8 are refreshed further down. */
; /* rbx and rdi are reloaded with the low qword of xmm1 / xmm2 once their offsets are used. */
movdqa xmm0, xmm9
pxor xmm0, xmm5
movdqa xmm5, xmm9
movd QWORD PTR [rbp+rbx], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm10
shr rcx, 24
pxor xmm0, xmm6
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [rbp+rbx+8], rcx
movd rbx, xmm1
movd QWORD PTR [r14+rdi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm11
shr rcx, 24
pxor xmm0, xmm7
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r14+rdi+8], rcx
movd rdi, xmm2
movd QWORD PTR [r15+rsi], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
movdqa xmm0, xmm12
shr rcx, 24
pxor xmm0, xmm8
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r15+rsi+8], rcx
movd QWORD PTR [r12+rax], xmm0
pextrq rdx, xmm0, 1
mov ecx, edx
shr rcx, 24
mov ecx, DWORD PTR [r13+rcx*4]
xor rcx, rdx
mov QWORD PTR [r12+rax+8], rcx
; /* Second half, lane 0: the low qword of the AES result (rcx) picks the second block at r8. */
; /* rdx:rax = rcx * low qword of that block; high qword of xmm1 += low half, rbx += high half. */
; /* Store rbx and (high xor [rsp]) there, fold the old block contents in, rebuild xmm1, */
; /* then mask rbx into the next offset and load the next block into xmm9. */
movd rcx, xmm9
mov r8, rcx
and r8d, 2097136
mov r9, QWORD PTR [rbp+r8]
mov r10, QWORD PTR [rbp+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm1, 1
add rcx, rax
add rbx, rdx
mov rax, QWORD PTR [rsp]
mov QWORD PTR [rbp+r8], rbx
xor rax, rcx
mov QWORD PTR [rbp+r8+8], rax
xor rcx, r10
xor rbx, r9
movd xmm1, rbx
and ebx, 2097136
pinsrq xmm1, rcx, 1
; /* Second half, lane 1: same steps with xmm10, r14, rdi, xmm2 and [rsp+8]. */
movd rcx, xmm10
mov r8, rcx
and r8d, 2097136
movdqu xmm9, XMMWORD PTR [rbp+rbx]
mov r9, QWORD PTR [r14+r8]
mov r10, QWORD PTR [r14+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm2, 1
add rcx, rax
add rdi, rdx
mov rax, QWORD PTR [rsp+8]
mov QWORD PTR [r14+r8], rdi
xor rax, rcx
xor rdi, r9
mov QWORD PTR [r14+r8+8], rax
xor rcx, r10
movd xmm2, rdi
and edi, 2097136
pinsrq xmm2, rcx, 1
; /* Second half, lane 2: same steps with xmm11, r15, rsi, xmm3 and [rsp+16]. */
; /* xmm6..xmm8 take the AES results of lanes 1..3 here, before xmm10 is overwritten. */
movd rcx, xmm11
movd rsi, xmm3
mov r8, rcx
and r8d, 2097136
movdqa xmm6, xmm10
movdqa xmm7, xmm11
movdqa xmm8, xmm12
movdqu xmm10, XMMWORD PTR [r14+rdi]
mov r9, QWORD PTR [r15+r8]
mov r10, QWORD PTR [r15+r8+8]
mov rax, r9
mul rcx
pextrq rcx, xmm3, 1
add rcx, rax
add rsi, rdx
mov rax, QWORD PTR [rsp+16]
xor rax, rcx
mov QWORD PTR [r15+r8], rsi
mov QWORD PTR [r15+r8+8], rax
xor rcx, r10
xor rsi, r9
movd xmm3, rsi
and esi, 2097136
pinsrq xmm3, rcx, 1
; /* Second half, lane 3: same arithmetic with xmm12, r12, xmm4 and [rsp+24]; */
; /* here rax carries the low half of the pair and rdx the high half. */
movd rcx, xmm12
mov r8, rcx
and r8d, 2097136
movdqu xmm11, XMMWORD PTR [r15+rsi]
mov r9, QWORD PTR [r12+r8]
mov r10, QWORD PTR [r12+r8+8]
mov rax, r9
mul rcx
mov rcx, rax
movd rax, xmm4
add rax, rdx
mov QWORD PTR [r12+r8], rax
xor rax, r9
pextrq rdx, xmm4, 1
add rdx, rcx
mov rcx, QWORD PTR [rsp+24]
xor rcx, rdx
xor rdx, r10
movd xmm4, rax
mov QWORD PTR [r12+r8+8], rcx
and eax, 2097136
pinsrq xmm4, rdx, 1
movdqu xmm12, XMMWORD PTR [r12+rax]
sub r11, 1
jne main_loop_cnv1_quad
; /* Epilogue: r11 = rsp+144 is the stack pointer as it was right after the five pushes. */
; /* The entry slots [rsp+8..24] are at [r11+48..64]; the xmm save slots are reloaded */
; /* from the same addresses they were written to in the prologue. */
movaps xmm7, XMMWORD PTR [rsp+112]
lea r11, QWORD PTR [rsp+144]
mov rbx, QWORD PTR [r11+48]
mov rbp, QWORD PTR [r11+56]
mov rsi, QWORD PTR [r11+64]
movaps xmm6, XMMWORD PTR [r11-16]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm9, XMMWORD PTR [r11-64]
movaps xmm10, XMMWORD PTR [r11-80]
movaps xmm11, XMMWORD PTR [r11-96]
movaps xmm12, XMMWORD PTR [r11-112]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi

View file

@ -0,0 +1,66 @@
; /* Loop fragment main_loop_cnv1_single: one lane. */
; /* This copy spells every 64-bit GPR/XMM transfer as movd with a 64-bit operand. */
; /* There is no entry label and no ret in this fragment; the including file provides both. */
; /* Comment form: a semicolon followed by a C block comment, so the line is a comment for MASM */
; /* and an empty statement once cpp has removed the block comment. */
; /* NOTE(review): confirm this with every assembler that includes the fragment. */
; /* In: rcx -> one context pointer at [rcx]. */
; /* Context fields read: qwords at +0..+56, base pointer at +224, pointer to a table of */
; /* dwords at +232, 64-bit value at +240. */
; /* Register roles inside the loop: */
; /*   r11/rbx running pair (low/high), xmm2 previous AES result */
; /*   rdi base pointer (presumably the scratchpad, confirm), r14 dword table */
; /*   rbp the +240 value, rsi iterations left */
; /* Save rbx, rbp, rsi, rdi at [rsp+8..32] as seen on entry to the fragment (the includer */
; /* must make those slots writable) and push r13-r15. r13 is pushed and popped but not */
; /* otherwise used in this fragment. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r13
push r14
push r15
; /* Setup. 524288 is the iteration count. */
; /* The running pair starts as (qword +0 xor +32, qword +8 xor +40); */
; /* xmm2 starts as (qword +16 xor +48, qword +24 xor +56). */
mov rdx, QWORD PTR [rcx]
mov esi, 524288
mov r11, QWORD PTR [rdx+32]
xor r11, QWORD PTR [rdx]
mov rdi, QWORD PTR [rdx+224]
mov rbx, QWORD PTR [rdx+40]
xor rbx, QWORD PTR [rdx+8]
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
mov rbp, QWORD PTR [rdx+240]
mov r14, QWORD PTR [rdx+232]
movd xmm2, rax
pinsrq xmm2, rcx, 1
ALIGN(64)
main_loop_cnv1_single:
; /* First half: r8 = r11 & 0x1FFFF0 (bits 4..20, a 16-byte aligned offset below 2 MiB). */
; /* Load that block, run one AES round keyed with rbx:r11, XOR with the previous AES */
; /* result and write back. The low qword is stored unchanged; the high qword is first */
; /* XORed with the table dword selected by bits 24..31 of that qword. */
; /* r15 keeps the low qword of the AES result and r9 the offset derived from it. */
mov r8, r11
and r8d, 2097136
movdqu xmm1, XMMWORD PTR [rdi+r8]
movd xmm0, r11
pinsrq xmm0, rbx, 1
aesenc xmm1, xmm0
movd r15, xmm1
mov r9, r15
and r9d, 2097136
movdqa xmm0, xmm1
pxor xmm0, xmm2
movdqa xmm2, xmm1
movd QWORD PTR [rdi+r8], xmm0
pextrq rdx, xmm0, 1
mov eax, edx
shr rax, 24
mov ecx, DWORD PTR [r14+rax*4]
xor rcx, rdx
mov QWORD PTR [rdi+r8+8], rcx
; /* Second half: rdx:rax = r15 * low qword of the block at r9; rbx += low half, */
; /* r11 += high half. Store r11 and (rbx xor rbp) there, then fold the old block */
; /* contents into r11/rbx for the next iteration. */
mov r10, QWORD PTR [rdi+r9]
mov r8, QWORD PTR [rdi+r9+8]
mov rax, r10
mul r15
add rbx, rax
add r11, rdx
mov QWORD PTR [rdi+r9], r11
mov rax, rbx
xor rbx, r8
xor r11, r10
xor rax, rbp
mov QWORD PTR [rdi+r9+8], rax
sub rsi, 1
jne main_loop_cnv1_single
; /* Epilogue: after the three pops rsp is back at its entry value, so the saved */
; /* registers are read from the same [rsp+8..32] slots they were written to. */
pop r15
pop r14
pop r13
mov rbx, QWORD PTR [rsp+8]
mov rbp, QWORD PTR [rsp+16]
mov rsi, QWORD PTR [rsp+24]
mov rdi, QWORD PTR [rsp+32]

View file

@ -1,6 +1,9 @@
#define ALIGN(x) .align 64
.intel_syntax noprefix
.section .text
.global cnv1_single_mainloop_asm
.global cnv1_double_mainloop_asm
.global cnv1_quad_mainloop_asm
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_mainloop_bulldozer_asm
@ -9,6 +12,24 @@
.global cnv2_rwz_double_mainloop_asm
.global cnv2_upx_double_mainloop_zen3_asm
ALIGN(64)
/*
 * cnv1_single_mainloop_asm: the function body is the included single-lane
 * loop fragment followed by ret; no stack or register adjustment is added here.
 */
cnv1_single_mainloop_asm:
#include "../cn1/cnv1_single_main_loop.inc"
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
/*
 * cnv1_double_mainloop_asm: the function body is the included two-lane
 * loop fragment followed by ret; no stack or register adjustment is added here.
 */
cnv1_double_mainloop_asm:
#include "../cn1/cnv1_double_main_loop.inc"
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
/*
 * cnv1_quad_mainloop_asm: the function body is the included four-lane
 * loop fragment followed by ret; no stack or register adjustment is added here.
 */
cnv1_quad_mainloop_asm:
#include "../cn1/cnv1_quad_main_loop.inc"
ret 0
/* Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
   end-of-function marker; keep the exact encoding until that is confirmed. */
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_ivybridge_asm:
#include "../cn2/cnv2_main_loop_ivybridge.inc"

View file

@ -1,4 +1,7 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_single_mainloop_asm
PUBLIC cnv1_double_mainloop_asm
PUBLIC cnv1_quad_mainloop_asm
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
@ -6,28 +9,49 @@ PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN 64
ALIGN(64)
; cnv1_single_mainloop_asm: the procedure body is the included single-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_single_mainloop_asm PROC
INCLUDE cn1/cnv1_single_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_single_mainloop_asm ENDP
ALIGN(64)
; cnv1_double_mainloop_asm: the procedure body is the included two-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_double_mainloop_asm PROC
INCLUDE cn1/cnv1_double_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_double_mainloop_asm ENDP
ALIGN(64)
; cnv1_quad_mainloop_asm: the procedure body is the included four-lane loop
; fragment followed by ret; no extra prologue is added here.
cnv1_quad_mainloop_asm PROC
INCLUDE cn1/cnv1_quad_main_loop.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv1_quad_mainloop_asm ENDP
ALIGN(64)
; cnv2_mainloop_ivybridge_asm: the procedure body is the included loop fragment
; (cn2/cnv2_main_loop_ivybridge.inc) followed by ret; no extra prologue is added here.
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv2_mainloop_ivybridge_asm ENDP
ALIGN 64
ALIGN(64)
; cnv2_mainloop_ryzen_asm: the procedure body is the included loop fragment
; (cn2/cnv2_main_loop_ryzen.inc) followed by ret; no extra prologue is added here.
cnv2_mainloop_ryzen_asm PROC
INCLUDE cn2/cnv2_main_loop_ryzen.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
ALIGN(64)
; cnv2_mainloop_bulldozer_asm: the procedure body is the included loop fragment
; (cn2/cnv2_main_loop_bulldozer.inc) followed by ret; no extra prologue is added here.
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
ret 0
; Placed after ret, so never executed. 3735929054 = 0xDEADC0DE, presumably an
; end-of-function marker; keep the exact encoding until that is confirmed.
mov eax, 3735929054
cnv2_mainloop_bulldozer_asm ENDP
ALIGN 64
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
ret 0