diff --git a/src/crypto/asm/win/cnv1_main_loop_fast_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_fast_sandybridge.inc deleted file mode 100644 index e0fc8f14..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_fast_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 2097136 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cnv1_main_loop_fast_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 2097136 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 2097136 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_main_loop_fast_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cnv1_main_loop_fast_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_fast_soft_aes_sandybridge.inc deleted file mode 100644 index 9e09a83d..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_fast_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - ALIGN 64 -cnv1_main_loop_fast_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 2097136 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 2097136 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cnv1_main_loop_fast_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cnv1_main_loop_lite_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_lite_sandybridge.inc deleted file mode 100644 index 4f369c7a..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_lite_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 1048560 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cnv1_main_loop_lite_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 1048560 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 1048560 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_main_loop_lite_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cnv1_main_loop_lite_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_lite_soft_aes_sandybridge.inc deleted file mode 100644 index 7dc9ac8a..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_lite_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - ALIGN 64 -cnv1_main_loop_lite_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cnv1_main_loop_lite_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cnv1_main_loop_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_sandybridge.inc deleted file mode 100644 index e48183da..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 524288 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 2097136 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cnv1_main_loop_original_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 2097136 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 2097136 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_main_loop_original_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc deleted file mode 100644 index 5fd5aecb..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 524288 - - ALIGN 64 -cnv1_main_loop_original_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 2097136 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 2097136 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cnv1_main_loop_original_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cnv1_main_loop_upx_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_upx_sandybridge.inc deleted file mode 100644 index b8beee2a..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_upx_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 131072 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 1048560 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cnv1_main_loop_upx_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 1048560 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 1048560 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_main_loop_upx_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cnv1_main_loop_upx_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_upx_soft_aes_sandybridge.inc deleted file mode 100644 index 13351f22..00000000 --- a/src/crypto/asm/win/cnv1_main_loop_upx_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 131072 - - ALIGN 64 -cnv1_main_loop_upx_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cnv1_main_loop_upx_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cnv2_double_main_loop_fastv2_sandybridge.inc b/src/crypto/asm/win/cnv2_double_main_loop_fastv2_sandybridge.inc deleted file mode 100644 index 8295ef06..00000000 --- a/src/crypto/asm/win/cnv2_double_main_loop_fastv2_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 64 -cnv2_double_main_loop_fastv2_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_fastv2_sandybridge -div_fix_1_ret_fastv2_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_fastv2_sandybridge -div_fix_2_ret_fastv2_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_fastv2_sandybridge -sqrt_fix_1_ret_fastv2_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_fastv2_sandybridge -sqrt_fix_2_ret_fastv2_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne cnv2_double_main_loop_fastv2_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_main_loop_fastv2_sandybridge_endp - -div_fix_1_fastv2_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_fastv2_sandybridge - -div_fix_2_fastv2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_fastv2_sandybridge - -sqrt_fix_1_fastv2_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_fastv2_sandybridge - -sqrt_fix_2_fastv2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_fastv2_sandybridge - -cnv2_double_main_loop_fastv2_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc deleted file mode 100644 index da2c1b9e..00000000 --- a/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 524288 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 64 -cnv2_double_main_loop_originalv2_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_originalv2_sandybridge -div_fix_1_ret_originalv2_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_originalv2_sandybridge -div_fix_2_ret_originalv2_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_originalv2_sandybridge -sqrt_fix_1_ret_originalv2_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_originalv2_sandybridge -sqrt_fix_2_ret_originalv2_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne cnv2_double_main_loop_originalv2_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_main_loop_originalv2_sandybridge_endp - -div_fix_1_originalv2_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_originalv2_sandybridge - -div_fix_2_originalv2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_originalv2_sandybridge - -sqrt_fix_1_originalv2_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_originalv2_sandybridge - -sqrt_fix_2_originalv2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_originalv2_sandybridge - -cnv2_double_main_loop_originalv2_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_double_main_loop_ultralite_sandybridge.inc b/src/crypto/asm/win/cnv2_double_main_loop_ultralite_sandybridge.inc deleted file mode 100644 index e7c0eb60..00000000 --- a/src/crypto/asm/win/cnv2_double_main_loop_ultralite_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 65536 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 131056 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 131056 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 64 -cnv2_double_main_loop_ultralite_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 131056 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 131056 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 131056 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_ultralite_sandybridge -div_fix_1_ret_ultralite_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_ultralite_sandybridge -div_fix_2_ret_ultralite_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_ultralite_sandybridge -sqrt_fix_1_ret_ultralite_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_ultralite_sandybridge -sqrt_fix_2_ret_ultralite_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 131056 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne cnv2_double_main_loop_ultralite_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_main_loop_ultralite_sandybridge_endp - -div_fix_1_ultralite_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_ultralite_sandybridge - -div_fix_2_ultralite_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_ultralite_sandybridge - -sqrt_fix_1_ultralite_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_ultralite_sandybridge - -sqrt_fix_2_ultralite_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_ultralite_sandybridge - -cnv2_double_main_loop_ultralite_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc deleted file mode 100644 index 6bae7a3c..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_originalv2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_originalv2_bulldozer - shr rdi, 19 - -sqrt_fixup_originalv2_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_originalv2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_originalv2_bulldozer_endp - -sqrt_fixup_originalv2_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_originalv2_bulldozer_ret - -cnv2_main_loop_originalv2_bulldozer_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_fastv2_bulldozer.inc b/src/crypto/asm/win/cnv2_main_loop_fastv2_bulldozer.inc deleted file mode 100644 index 3915c632..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_fastv2_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_fastv2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_fastv2_bulldozer - shr rdi, 19 - -sqrt_fixup_fastv2_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_fastv2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_fastv2_bulldozer_endp - -sqrt_fixup_fastv2_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_fastv2_bulldozer_ret - -cnv2_main_loop_fastv2_bulldozer_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_fastv2_ivybridge.inc b/src/crypto/asm/win/cnv2_main_loop_fastv2_ivybridge.inc deleted file mode 100644 index 0d0bc4fe..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_fastv2_ivybridge.inc +++ /dev/null @@ -1,182 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 64 -cnv2_main_loop_fastv2_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je sqrt_fixup_fastv2_ivybridge - psrlq xmm3, 19 -sqrt_fixup_fastv2_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne cnv2_main_loop_fastv2_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_fastv2_ivybridge_endp - -sqrt_fixup_fastv2_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp sqrt_fixup_fastv2_ivybridge_ret - -cnv2_main_loop_fastv2_ivybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_fastv2_ryzen.inc b/src/crypto/asm/win/cnv2_main_loop_fastv2_ryzen.inc deleted file mode 100644 index 33fc7804..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_fastv2_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 64 -cnv2_main_loop_fastv2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_fastv2_ryzen - shr rdi, 19 - -sqrt_fixup_fastv2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_fastv2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_fastv2_ryzen_endp - -sqrt_fixup_fastv2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_fastv2_ryzen_ret - -cnv2_main_loop_fastv2_ryzen_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc deleted file mode 100644 index 06a899c0..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc +++ /dev/null @@ -1,267 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 2097136 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 262144 - - ALIGN 64 -cnv2_main_loop_fastv2_soft_aes_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 2097136 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_fastv2_soft_aes_sandybridge - psrlq xmm1, 19 -sqrt_fixup_fastv2_soft_aes_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 2097136 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_main_loop_fastv2_soft_aes_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_main_loop_fastv2_soft_aes_sandybridge_endp - -sqrt_fixup_fastv2_soft_aes_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_fastv2_soft_aes_sandybridge_ret - -cnv2_main_loop_fastv2_soft_aes_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc deleted file mode 100644 index aa2797c3..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc +++ /dev/null @@ -1,182 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 524288 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 64 -cnv2_main_loop_originalv2_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je sqrt_fixup_originalv2_ivybridge - psrlq xmm3, 19 -sqrt_fixup_originalv2_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne cnv2_main_loop_originalv2_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_originalv2_ivybridge_endp - -sqrt_fixup_originalv2_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp sqrt_fixup_originalv2_ivybridge_ret - -cnv2_main_loop_originalv2_ivybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ryzen.inc b/src/crypto/asm/win/cnv2_main_loop_ryzen.inc deleted file mode 100644 index 5a0ab3b7..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 64 -cnv2_main_loop_originalv2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_originalv2_ryzen - shr rdi, 19 - -sqrt_fixup_originalv2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_originalv2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_originalv2_ryzen_endp - -sqrt_fixup_originalv2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_originalv2_ryzen_ret - -cnv2_main_loop_originalv2_ryzen_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc deleted file mode 100644 index 64650a25..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,267 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 2097136 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 524288 - - ALIGN 64 -cnv2_main_loop_originalv2_soft_aes_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 2097136 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_originalv2_soft_aes_sandybridge - psrlq xmm1, 19 -sqrt_fixup_originalv2_soft_aes_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 2097136 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_main_loop_originalv2_soft_aes_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_main_loop_originalv2_soft_aes_sandybridge_endp - -sqrt_fixup_originalv2_soft_aes_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_originalv2_soft_aes_sandybridge_ret - -cnv2_main_loop_originalv2_soft_aes_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ultralite_bulldozer.inc b/src/crypto/asm/win/cnv2_main_loop_ultralite_bulldozer.inc deleted file mode 100644 index ca48cae1..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ultralite_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_ultralite_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_ultralite_bulldozer - shr rdi, 19 - -sqrt_fixup_ultralite_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_ultralite_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_ultralite_bulldozer_endp - -sqrt_fixup_ultralite_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_ultralite_bulldozer_ret - -cnv2_main_loop_ultralite_bulldozer_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ultralite_ivybridge.inc b/src/crypto/asm/win/cnv2_main_loop_ultralite_ivybridge.inc deleted file mode 100644 index 61aa261c..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ultralite_ivybridge.inc +++ /dev/null @@ -1,182 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 65536 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 131056 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 64 -cnv2_main_loop_ultralite_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 131056 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je sqrt_fixup_ultralite_ivybridge - psrlq xmm3, 19 -sqrt_fixup_ultralite_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 131056 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne cnv2_main_loop_ultralite_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp cnv2_main_loop_ultralite_ivybridge_endp - -sqrt_fixup_ultralite_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp sqrt_fixup_ultralite_ivybridge_ret - -cnv2_main_loop_ultralite_ivybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ultralite_ryzen.inc b/src/crypto/asm/win/cnv2_main_loop_ultralite_ryzen.inc deleted file mode 100644 index 0933ec35..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ultralite_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 64 -cnv2_main_loop_ultralite_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_ultralite_ryzen - shr rdi, 19 - -sqrt_fixup_ultralite_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_ultralite_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_ultralite_ryzen_endp - -sqrt_fixup_ultralite_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_ultralite_ryzen_ret - -cnv2_main_loop_ultralite_ryzen_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc deleted file mode 100644 index 408254ee..00000000 --- a/src/crypto/asm/win/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc +++ /dev/null @@ -1,267 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 131056 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 65536 - - ALIGN 64 -cnv2_main_loop_ultralite_soft_aes_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 131056 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_ultralite_soft_aes_sandybridge - psrlq xmm1, 19 -sqrt_fixup_ultralite_soft_aes_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 131056 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_main_loop_ultralite_soft_aes_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_main_loop_ultralite_soft_aes_sandybridge_endp - -sqrt_fixup_ultralite_soft_aes_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_ultralite_soft_aes_sandybridge_ret - -cnv2_main_loop_ultralite_soft_aes_sandybridge_endp: