Use the new-style method to call ASM functions for cn/2 and add Bulldozer ASM code.
This commit is contained in:
parent
7574bfab60
commit
ef2e8bed6e
16 changed files with 325 additions and 150 deletions
|
@ -94,7 +94,7 @@
|
|||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
main_loop_double_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
180
crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
Normal file
180
crypto/asm/cn2/cnv2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_main_loop_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
|
||||
sqrt_fixup_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
|
@ -50,7 +50,7 @@
|
|||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
main_loop_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
|
@ -45,7 +45,7 @@
|
|||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
main_loop_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
|
@ -1,4 +1,8 @@
|
|||
#define ALIGN .align
|
||||
#ifdef __APPLE__
|
||||
# define ALIGN(x) .align 6
|
||||
#else
|
||||
# define ALIGN(x) .align 64
|
||||
#endif
|
||||
.intel_syntax noprefix
|
||||
#ifdef __APPLE__
|
||||
# define FN_PREFIX(fn) _ ## fn
|
||||
|
@ -9,29 +13,42 @@
|
|||
#endif
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_ivybridge.inc"
|
||||
#include "cn2/cnv2_main_loop_ivybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_ryzen.inc"
|
||||
#include "cn2/cnv2_main_loop_ryzen.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN 16
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn2/cnv2_main_loop_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
#include "cnv2_double_main_loop_sandybridge.inc"
|
||||
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
mov eax, 3735929054
|
|
@ -1,24 +1,35 @@
|
|||
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_ivybridge.inc
|
||||
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cnv2_main_loop_ryzen.inc
|
||||
INCLUDE cn2/cnv2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cnv2_double_main_loop_sandybridge.inc
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||
|
||||
_TEXT_CNV2_MAINLOOP ENDS
|
31
crypto/asm/win64/cn_main_loop.S
Normal file
31
crypto/asm/win64/cn_main_loop.S
Normal file
|
@ -0,0 +1,31 @@
|
|||
#define ALIGN(x) .align 64
|
||||
.intel_syntax noprefix
|
||||
.section .text
|
||||
.global cnv2_mainloop_ivybridge_asm
|
||||
.global cnv2_mainloop_ryzen_asm
|
||||
.global cnv2_mainloop_bulldozer_asm
|
||||
.global cnv2_double_mainloop_sandybridge_asm
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ivybridge_asm:
|
||||
#include "../cn2/cnv2_main_loop_ivybridge.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_ryzen_asm:
|
||||
#include "../cn2/cnv2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_mainloop_bulldozer_asm:
|
||||
#include "../cn2/cnv2_main_loop_bulldozer.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
||||
|
||||
ALIGN(64)
|
||||
cnv2_double_mainloop_sandybridge_asm:
|
||||
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
mov eax, 3735929054
|
|
@ -1,21 +0,0 @@
|
|||
#define ALIGN .align
|
||||
.intel_syntax noprefix
|
||||
.section .text
|
||||
.global cnv2_mainloop_ivybridge_asm
|
||||
.global cnv2_mainloop_ryzen_asm
|
||||
.global cnv2_double_mainloop_sandybridge_asm
|
||||
|
||||
ALIGN 16
|
||||
cnv2_mainloop_ivybridge_asm:
|
||||
#include "../cnv2_main_loop_ivybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_mainloop_ryzen_asm:
|
||||
#include "../cnv2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_double_mainloop_sandybridge_asm:
|
||||
#include "../cnv2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
Loading…
Add table
Add a link
Reference in a new issue