Move files.

This commit is contained in:
XMRig 2019-06-04 19:20:33 +07:00
parent ac43cd4f9c
commit d587eebaf2
70 changed files with 69 additions and 70 deletions

View file

@ -0,0 +1,281 @@
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_part1):
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightR_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
FN_PREFIX(CryptonightR_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop)
FN_PREFIX(CryptonightR_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightR_soft_aes_template_end):

View file

@ -0,0 +1,281 @@
PUBLIC CryptonightR_soft_aes_template_part1
PUBLIC CryptonightR_soft_aes_template_mainloop
PUBLIC CryptonightR_soft_aes_template_part2
PUBLIC CryptonightR_soft_aes_template_part3
PUBLIC CryptonightR_soft_aes_template_end
ALIGN(64)
CryptonightR_soft_aes_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightR_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
CryptonightR_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightR_soft_aes_template_mainloop
CryptonightR_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightR_soft_aes_template_end:

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,536 @@
PUBLIC FN_PREFIX(CryptonightR_template_part1)
PUBLIC FN_PREFIX(CryptonightR_template_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_part2)
PUBLIC FN_PREFIX(CryptonightR_template_part3)
PUBLIC FN_PREFIX(CryptonightR_template_end)
PUBLIC FN_PREFIX(CryptonightR_template_double_part1)
PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightR_template_double_part2)
PUBLIC FN_PREFIX(CryptonightR_template_double_part3)
PUBLIC FN_PREFIX(CryptonightR_template_double_part4)
PUBLIC FN_PREFIX(CryptonightR_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightR_template_part1):
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightR_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
FN_PREFIX(CryptonightR_template_part2):
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightR_template_mainloop)
FN_PREFIX(CryptonightR_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightR_template_end):
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_part1):
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightR_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightR_template_double_part2):
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightR_template_double_part3):
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
mov rdi, rcx
mov r8, rax
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightR_template_double_mainloop)
FN_PREFIX(CryptonightR_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightR_template_double_end):

View file

@ -0,0 +1,536 @@
PUBLIC CryptonightR_template_part1
PUBLIC CryptonightR_template_mainloop
PUBLIC CryptonightR_template_part2
PUBLIC CryptonightR_template_part3
PUBLIC CryptonightR_template_end
PUBLIC CryptonightR_template_double_part1
PUBLIC CryptonightR_template_double_mainloop
PUBLIC CryptonightR_template_double_part2
PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
ALIGN(64)
CryptonightR_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightR_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
movq r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightR_template_mainloop
CryptonightR_template_part3:
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightR_template_end:
ALIGN(64)
CryptonightR_template_double_part1:
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightR_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movq rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightR_template_double_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightR_template_double_part3:
movq r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
mov rdi, rcx
mov r8, rax
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightR_template_double_mainloop
CryptonightR_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightR_template_double_end:

View file

@ -0,0 +1,268 @@
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_part1):
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop):
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
FN_PREFIX(CryptonightWOW_soft_aes_template_part2):
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop)
FN_PREFIX(CryptonightWOW_soft_aes_template_part3):
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
FN_PREFIX(CryptonightWOW_soft_aes_template_end):

View file

@ -0,0 +1,268 @@
PUBLIC CryptonightWOW_soft_aes_template_part1
PUBLIC CryptonightWOW_soft_aes_template_mainloop
PUBLIC CryptonightWOW_soft_aes_template_part2
PUBLIC CryptonightWOW_soft_aes_template_part3
PUBLIC CryptonightWOW_soft_aes_template_end
ALIGN(64)
CryptonightWOW_soft_aes_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movq xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightWOW_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
CryptonightWOW_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightWOW_soft_aes_template_mainloop
CryptonightWOW_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightWOW_soft_aes_template_end:

View file

@ -0,0 +1,491 @@
PUBLIC FN_PREFIX(CryptonightWOW_template_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_end)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4)
PUBLIC FN_PREFIX(CryptonightWOW_template_double_end)
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_part1):
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_mainloop):
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
FN_PREFIX(CryptonightWOW_template_part2):
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz FN_PREFIX(CryptonightWOW_template_mainloop)
FN_PREFIX(CryptonightWOW_template_part3):
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
FN_PREFIX(CryptonightWOW_template_end):
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_part1):
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
FN_PREFIX(CryptonightWOW_template_double_mainloop):
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
FN_PREFIX(CryptonightWOW_template_double_part2):
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
FN_PREFIX(CryptonightWOW_template_double_part3):
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz FN_PREFIX(CryptonightWOW_template_double_mainloop)
FN_PREFIX(CryptonightWOW_template_double_part4):
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
FN_PREFIX(CryptonightWOW_template_double_end):

View file

@ -0,0 +1,491 @@
PUBLIC CryptonightWOW_template_part1
PUBLIC CryptonightWOW_template_mainloop
PUBLIC CryptonightWOW_template_part2
PUBLIC CryptonightWOW_template_part3
PUBLIC CryptonightWOW_template_end
PUBLIC CryptonightWOW_template_double_part1
PUBLIC CryptonightWOW_template_double_mainloop
PUBLIC CryptonightWOW_template_double_part2
PUBLIC CryptonightWOW_template_double_part3
PUBLIC CryptonightWOW_template_double_part4
PUBLIC CryptonightWOW_template_double_end
ALIGN(64)
CryptonightWOW_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movq xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movq xmm0, r12
movq xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movq xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightWOW_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movq xmm0, r15
movq xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movq r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightWOW_template_part2:
mov rax, r13
mul r12
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightWOW_template_mainloop
CryptonightWOW_template_part3:
movq rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightWOW_template_end:
ALIGN(64)
CryptonightWOW_template_double_part1:
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movq xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movq xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movq xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movq xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movq xmm0, rcx
mov r11d, 524288
movq xmm10, rax
punpcklqdq xmm10, xmm0
movq xmm14, QWORD PTR [rsp+128]
movq xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightWOW_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movq xmm0, r12
mov ecx, ebx
movq xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movq rdx, xmm6
movq xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movq xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movq rdi, xmm5
movq rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movq xmm0, rsp
movq xmm1, rsi
movq xmm2, rdi
movq xmm11, rbp
movq xmm12, r15
movq xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightWOW_template_double_part2:
movq rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movq rsi, xmm1
movq rdi, xmm2
movq rbp, xmm11
movq r15, xmm12
movq rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movq xmm1, rdx
movq xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movq rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movq xmm0, rsp
movq xmm1, rbx
movq xmm2, rsi
movq xmm11, rdi
movq xmm12, rbp
movq xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movq xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightWOW_template_double_part3:
movq rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movq rbx, xmm1
movq rsi, xmm2
movq rdi, xmm11
movq rbp, xmm12
movq r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movq xmm1, rdx
movq xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movq rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightWOW_template_double_mainloop
CryptonightWOW_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightWOW_template_double_end:

View file

@ -0,0 +1,413 @@
mov rdx, [rcx+8]
mov rcx, [rcx]
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 524288
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN(64)
main_loop_double_sandybridge:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movq r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js div_fix_1_sandybridge
div_fix_1_ret_sandybridge:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_sandybridge
div_fix_2_ret_sandybridge:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je sqrt_fix_1_sandybridge
sqrt_fix_1_ret_sandybridge:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_sandybridge
sqrt_fix_2_ret_sandybridge:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_sandybridge
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp cnv2_double_mainloop_asm_sandybridge_endp
div_fix_1_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_sandybridge
div_fix_2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_sandybridge
sqrt_fix_1_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_sandybridge
sqrt_fix_2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_sandybridge
cnv2_double_mainloop_asm_sandybridge_endp:

View file

@ -0,0 +1,182 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN(64)
cnv2_main_loop_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
mov edi, 1023
shl rdi, 52
movq r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_bulldozer_endp
sqrt_fixup_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -0,0 +1,188 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 524288
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movq xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN(64)
main_loop_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movq rdx, xmm3
test edx, 524287
je sqrt_fixup_ivybridge
psrlq xmm3, 19
sqrt_fixup_ivybridge_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne main_loop_ivybridge
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_main_loop_ivybridge_endp
sqrt_fixup_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp sqrt_fixup_ivybridge_ret
cnv2_main_loop_ivybridge_endp:

View file

@ -0,0 +1,181 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN(64)
main_loop_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
movq r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_ryzen
shr rdi, 19
sqrt_fixup_ryzen_ret:
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne main_loop_ryzen
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_ryzen_endp
sqrt_fixup_ryzen:
movq r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ryzen_ret
cnv2_main_loop_ryzen_endp:

View file

@ -0,0 +1,413 @@
mov rdx, [rcx+8]
mov rcx, [rcx]
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 393216
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN(64)
rwz_main_loop_double:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movq r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rax+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm3
paddq xmm1, xmm2
movdqu XMMWORD PTR [r8+r13], xmm0
xor r8d, 32
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm7
movdqu XMMWORD PTR [r11+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js rwz_div_fix_1
rwz_div_fix_1_ret:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js rwz_div_fix_2
rwz_div_fix_2_ret:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je rwz_sqrt_fix_1
rwz_sqrt_fix_1_ret:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je rwz_sqrt_fix_2
rwz_sqrt_fix_2_ret:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm3, xmm6
paddq xmm1, xmm11
paddq xmm0, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm3
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm0
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne rwz_main_loop_double
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp rwz_cnv2_double_mainloop_asm_endp
rwz_div_fix_1:
dec rbx
add r11, rdx
jmp rwz_div_fix_1_ret
rwz_div_fix_2:
dec rdx
add r8, r9
jmp rwz_div_fix_2_ret
rwz_sqrt_fix_1:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp rwz_sqrt_fix_1_ret
rwz_sqrt_fix_2:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp rwz_sqrt_fix_2_ret
rwz_cnv2_double_mainloop_asm_endp:

View file

@ -0,0 +1,188 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 393216
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movq xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN(64)
rwz_main_loop:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm0, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm2, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
paddq xmm1, xmm7
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movq rdx, xmm3
test edx, 524287
je rwz_sqrt_fixup
psrlq xmm3, 19
rwz_sqrt_fixup_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm4
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm5
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm2
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne rwz_main_loop
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_rwz_main_loop_endp
rwz_sqrt_fixup:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp rwz_sqrt_fixup_ret
cnv2_rwz_main_loop_endp:

View file

@ -0,0 +1,73 @@
#ifdef __APPLE__
# define ALIGN(x) .align 6
#else
# define ALIGN(x) .align 64
#endif
.intel_syntax noprefix
#ifdef __APPLE__
# define FN_PREFIX(fn) _ ## fn
.text
#else
# define FN_PREFIX(fn) fn
.section .text
#endif
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv2_rwz_mainloop_asm)
.global FN_PREFIX(cnv2_rwz_double_mainloop_asm)
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_main_loop_ivybridge.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_mainloop_ryzen_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_main_loop_ryzen.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_main_loop_bulldozer.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_double_main_loop_sandybridge.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_rwz_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_rwz_main_loop.inc"
add rsp, 48
ret 0
mov eax, 3735929054
ALIGN(64)
FN_PREFIX(cnv2_rwz_double_mainloop_asm):
sub rsp, 48
mov rcx, rdi
#include "cn2/cnv2_rwz_double_main_loop.inc"
add rsp, 48
ret 0
mov eax, 3735929054

View file

@ -0,0 +1,52 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN(64)
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ivybridge_asm ENDP
ALIGN(64)
cnv2_mainloop_ryzen_asm PROC
INCLUDE cn2/cnv2_main_loop_ryzen.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ryzen_asm ENDP
ALIGN(64)
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_bulldozer_asm ENDP
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
ret 0
mov eax, 3735929054
cnv2_double_mainloop_sandybridge_asm ENDP
ALIGN(64)
cnv2_rwz_mainloop_asm PROC
INCLUDE cn2/cnv2_rwz_main_loop.inc
ret 0
mov eax, 3735929054
cnv2_rwz_mainloop_asm ENDP
ALIGN(64)
cnv2_rwz_double_mainloop_asm PROC
INCLUDE cn2/cnv2_rwz_double_main_loop.inc
ret 0
mov eax, 3735929054
cnv2_rwz_double_mainloop_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END

View file

@ -0,0 +1,281 @@
PUBLIC CryptonightR_soft_aes_template_part1
PUBLIC CryptonightR_soft_aes_template_mainloop
PUBLIC CryptonightR_soft_aes_template_part2
PUBLIC CryptonightR_soft_aes_template_part3
PUBLIC CryptonightR_soft_aes_template_end
ALIGN(64)
CryptonightR_soft_aes_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightR_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
pxor xmm6, xmm1
pxor xmm6, xmm0
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
movaps xmm0, xmm5
psrldq xmm0, 8
movd r9d, xmm0
CryptonightR_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov edi, edi
shl rbp, 32
or rbp, rdi
xor r8, rbp
mov ebx, ebx
shl rsi, 32
or rsi, rbx
xor QWORD PTR [rsp+320], rsi
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm2
pxor xmm6, xmm1
paddq xmm1, xmm7
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm6, xmm0
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightR_soft_aes_template_mainloop
CryptonightR_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightR_soft_aes_template_end:

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,536 @@
PUBLIC CryptonightR_template_part1
PUBLIC CryptonightR_template_mainloop
PUBLIC CryptonightR_template_part2
PUBLIC CryptonightR_template_part3
PUBLIC CryptonightR_template_end
PUBLIC CryptonightR_template_double_part1
PUBLIC CryptonightR_template_double_mainloop
PUBLIC CryptonightR_template_double_part2
PUBLIC CryptonightR_template_double_part3
PUBLIC CryptonightR_template_double_part4
PUBLIC CryptonightR_template_double_end
ALIGN(64)
CryptonightR_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightR_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
mov r13d, r9d
mov eax, r9d
xor r9d, 48
xor r13d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movaps xmm3, xmm0
movdqu xmm2, XMMWORD PTR [r13+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
pxor xmm0, xmm2
pxor xmm5, xmm1
pxor xmm5, xmm0
movd r12, xmm5
movd r10d, xmm5
and r10d, 2097136
paddq xmm3, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r13+r11], xmm3
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
CryptonightR_template_part2:
lea rcx, [r10+r11]
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor rsp, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r15, rax
mov rax, r13
mul r12
add r15, rax
add rsp, rdx
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
movaps xmm3, xmm1
movdqa xmm2, XMMWORD PTR [r9+r11]
movdqa xmm0, XMMWORD PTR [r10+r11]
pxor xmm1, xmm2
pxor xmm5, xmm0
pxor xmm5, xmm1
paddq xmm3, xmm4
paddq xmm2, xmm6
paddq xmm0, xmm7
movdqu XMMWORD PTR [r9+r11], xmm0
movdqu XMMWORD PTR [r12+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm3
movdqa xmm7, xmm6
mov QWORD PTR [rcx], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [rcx+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightR_template_mainloop
CryptonightR_template_part3:
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightR_template_end:
ALIGN(64)
CryptonightR_template_double_part1:
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightR_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm1
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
pxor xmm6, xmm0
movd rdx, xmm6
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm1
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
pxor xmm5, xmm0
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightR_template_double_part2:
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r14, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r12, rax
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movdqu xmm1, XMMWORD PTR [rcx+rsi]
pxor xmm6, xmm1
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
pxor xmm6, xmm2
paddq xmm2, xmm3
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
pxor xmm6, xmm0
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightR_template_double_part3:
movd r15, xmm13
mov eax, edi
mov edx, ebp
shl rdx, 32
or rax, rdx
xor r15, rax
mov eax, ebx
mov edx, esi
shl rdx, 32
or rax, rdx
xor r13, rax
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
mov rdi, rcx
mov r8, rax
movdqu xmm1, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm1
xor ebp, 48
paddq xmm1, xmm8
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm2
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
pxor xmm5, xmm0
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightR_template_double_mainloop
CryptonightR_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightR_template_double_end:

View file

@ -0,0 +1,268 @@
PUBLIC CryptonightWOW_soft_aes_template_part1
PUBLIC CryptonightWOW_soft_aes_template_mainloop
PUBLIC CryptonightWOW_soft_aes_template_part2
PUBLIC CryptonightWOW_soft_aes_template_part3
PUBLIC CryptonightWOW_soft_aes_template_end
ALIGN(64)
CryptonightWOW_soft_aes_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 232
mov eax, [rcx+96]
mov ebx, [rcx+100]
mov esi, [rcx+104]
mov edx, [rcx+108]
mov [rsp+144], eax
mov [rsp+148], ebx
mov [rsp+152], esi
mov [rsp+156], edx
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movd xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movd xmm0, rdx
xor rax, QWORD PTR [r10+64]
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movd xmm5, rax
mov rax, r8
punpcklqdq xmm4, xmm0
and eax, 2097136
movd xmm10, QWORD PTR [r10+96]
movd xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+328], rax
movd xmm12, r11
mov QWORD PTR [rsp+320], r9
punpcklqdq xmm5, xmm0
movd xmm13, rcx
mov r12d, 524288
ALIGN(64)
CryptonightWOW_soft_aes_template_mainloop:
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
mov esi, DWORD PTR [r13]
movd xmm0, r9
mov r10d, DWORD PTR [r13+4]
movd xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+328]
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
movd r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movd rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
movd rdi, xmm6
mov r10, rdi
and r10d, 2097136
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [r13], xmm0
mov ebx, [rsp+144]
mov ebp, [rsp+152]
add ebx, [rsp+148]
add ebp, [rsp+156]
shl rbp, 32
or rbx, rbp
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
mov [rsp+160], rbx
mov [rsp+168], rdi
mov [rsp+176], rbp
mov [rsp+184], r10
mov r10, rsp
mov ebx, [rsp+144]
mov esi, [rsp+148]
mov edi, [rsp+152]
mov ebp, [rsp+156]
movd esp, xmm7
movaps xmm0, xmm7
psrldq xmm0, 8
movd r15d, xmm0
movd eax, xmm4
movd edx, xmm5
CryptonightWOW_soft_aes_template_part2:
mov rsp, r10
mov [rsp+144], ebx
mov [rsp+148], esi
mov [rsp+152], edi
mov [rsp+156], ebp
mov rbx, [rsp+160]
mov rdi, [rsp+168]
mov rbp, [rsp+176]
mov r10, [rsp+184]
mov r9, r10
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movd xmm0, rax
movd xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+320]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
mov r10, QWORD PTR [rsp+304]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+320], r9
mov QWORD PTR [rsp+328], rax
sub r12d, 1
jne CryptonightWOW_soft_aes_template_mainloop
CryptonightWOW_soft_aes_template_part3:
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 232
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
ret
CryptonightWOW_soft_aes_template_end:

View file

@ -0,0 +1,491 @@
PUBLIC CryptonightWOW_template_part1
PUBLIC CryptonightWOW_template_mainloop
PUBLIC CryptonightWOW_template_part2
PUBLIC CryptonightWOW_template_part3
PUBLIC CryptonightWOW_template_end
PUBLIC CryptonightWOW_template_double_part1
PUBLIC CryptonightWOW_template_double_mainloop
PUBLIC CryptonightWOW_template_double_part2
PUBLIC CryptonightWOW_template_double_part3
PUBLIC CryptonightWOW_template_double_part4
PUBLIC CryptonightWOW_template_double_end
ALIGN(64)
CryptonightWOW_template_part1:
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push r10
push r11
push r12
push r13
push r14
push r15
push rdi
sub rsp, 64
mov r12, rcx
mov r8, QWORD PTR [r12+32]
mov rdx, r12
xor r8, QWORD PTR [r12]
mov r15, QWORD PTR [r12+40]
mov r9, r8
xor r15, QWORD PTR [r12+8]
mov r11, QWORD PTR [r12+224]
mov r12, QWORD PTR [r12+56]
xor r12, QWORD PTR [rdx+24]
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm0, r12
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
movaps XMMWORD PTR [rsp], xmm9
mov r12, QWORD PTR [rdx+88]
xor r12, QWORD PTR [rdx+72]
movd xmm6, rax
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm6, xmm0
and r9d, 2097136
movd xmm0, r12
movd xmm7, rax
punpcklqdq xmm7, xmm0
mov r10d, r9d
movd xmm9, rsp
mov rsp, r8
mov r8d, 524288
mov ebx, [rdx+96]
mov esi, [rdx+100]
mov edi, [rdx+104]
mov ebp, [rdx+108]
ALIGN(64)
CryptonightWOW_template_mainloop:
movdqa xmm5, XMMWORD PTR [r9+r11]
movd xmm0, r15
movd xmm4, rsp
punpcklqdq xmm4, xmm0
lea rdx, QWORD PTR [r9+r11]
aesenc xmm5, xmm4
movd r10d, xmm5
and r10d, 2097136
mov r12d, r9d
mov eax, r9d
xor r9d, 48
xor r12d, 16
xor eax, 32
movdqu xmm0, XMMWORD PTR [r9+r11]
movdqu xmm2, XMMWORD PTR [r12+r11]
movdqu xmm1, XMMWORD PTR [rax+r11]
paddq xmm0, xmm7
paddq xmm2, xmm6
paddq xmm1, xmm4
movdqu XMMWORD PTR [r12+r11], xmm0
movd r12, xmm5
movdqu XMMWORD PTR [rax+r11], xmm2
movdqu XMMWORD PTR [r9+r11], xmm1
movdqa xmm0, xmm5
pxor xmm0, xmm6
movdqu XMMWORD PTR [rdx], xmm0
lea r13d, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or r13, rdx
xor r13, QWORD PTR [r10+r11]
mov r14, QWORD PTR [r10+r11+8]
movd eax, xmm6
movd edx, xmm7
pextrd r9d, xmm7, 2
CryptonightWOW_template_part2:
mov rax, r13
mul r12
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
mov r9d, r10d
mov r12d, r10d
xor r9d, 16
xor r12d, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [r12+r11]
xor rdx, QWORD PTR [r12+r11]
xor rax, QWORD PTR [r11+r12+8]
movdqa xmm2, XMMWORD PTR [r9+r11]
pxor xmm3, xmm2
paddq xmm7, XMMWORD PTR [r10+r11]
paddq xmm1, xmm4
paddq xmm3, xmm6
movdqu XMMWORD PTR [r9+r11], xmm7
movdqu XMMWORD PTR [r12+r11], xmm3
movdqu XMMWORD PTR [r10+r11], xmm1
movdqa xmm7, xmm6
add r15, rax
add rsp, rdx
xor r10, 48
mov QWORD PTR [r10+r11], rsp
xor rsp, r13
mov r9d, esp
mov QWORD PTR [r10+r11+8], r15
and r9d, 2097136
xor r15, r14
movdqa xmm6, xmm5
dec r8d
jnz CryptonightWOW_template_mainloop
CryptonightWOW_template_part3:
movd rsp, xmm9
mov rbx, QWORD PTR [rsp+136]
mov rbp, QWORD PTR [rsp+144]
mov rsi, QWORD PTR [rsp+152]
movaps xmm6, XMMWORD PTR [rsp+48]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+16]
movaps xmm9, XMMWORD PTR [rsp]
add rsp, 64
pop rdi
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
ret 0
CryptonightWOW_template_end:
ALIGN(64)
CryptonightWOW_template_double_part1:
mov rdx, [rcx+8]
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 320
mov r14, QWORD PTR [rcx+32]
mov r8, rcx
xor r14, QWORD PTR [rcx]
mov r12, QWORD PTR [rcx+40]
mov ebx, r14d
mov rsi, QWORD PTR [rcx+224]
and ebx, 2097136
xor r12, QWORD PTR [rcx+8]
mov rcx, QWORD PTR [rcx+56]
xor rcx, QWORD PTR [r8+24]
mov rax, QWORD PTR [r8+48]
xor rax, QWORD PTR [r8+16]
mov r15, QWORD PTR [rdx+32]
xor r15, QWORD PTR [rdx]
movd xmm0, rcx
mov rcx, QWORD PTR [r8+88]
xor rcx, QWORD PTR [r8+72]
mov r13, QWORD PTR [rdx+40]
mov rdi, QWORD PTR [rdx+224]
xor r13, QWORD PTR [rdx+8]
movaps XMMWORD PTR [rsp+160], xmm6
movaps XMMWORD PTR [rsp+176], xmm7
movaps XMMWORD PTR [rsp+192], xmm8
movaps XMMWORD PTR [rsp+208], xmm9
movaps XMMWORD PTR [rsp+224], xmm10
movaps XMMWORD PTR [rsp+240], xmm11
movaps XMMWORD PTR [rsp+256], xmm12
movaps XMMWORD PTR [rsp+272], xmm13
movaps XMMWORD PTR [rsp+288], xmm14
movaps XMMWORD PTR [rsp+304], xmm15
movd xmm7, rax
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
movaps xmm1, XMMWORD PTR [rdx+96]
movaps xmm2, XMMWORD PTR [r8+96]
movaps XMMWORD PTR [rsp], xmm1
movaps XMMWORD PTR [rsp+16], xmm2
mov r8d, r15d
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+56]
xor rcx, QWORD PTR [rdx+24]
movd xmm9, rax
mov QWORD PTR [rsp+128], rsi
mov rax, QWORD PTR [rdx+48]
xor rax, QWORD PTR [rdx+16]
punpcklqdq xmm9, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [rdx+88]
xor rcx, QWORD PTR [rdx+72]
movd xmm8, rax
mov QWORD PTR [rsp+136], rdi
mov rax, QWORD PTR [rdx+80]
xor rax, QWORD PTR [rdx+64]
punpcklqdq xmm8, xmm0
and r8d, 2097136
movd xmm0, rcx
mov r11d, 524288
movd xmm10, rax
punpcklqdq xmm10, xmm0
movd xmm14, QWORD PTR [rsp+128]
movd xmm15, QWORD PTR [rsp+136]
ALIGN(64)
CryptonightWOW_template_double_mainloop:
movdqu xmm6, XMMWORD PTR [rbx+rsi]
movd xmm0, r12
mov ecx, ebx
movd xmm3, r14
punpcklqdq xmm3, xmm0
xor ebx, 16
aesenc xmm6, xmm3
movd rdx, xmm6
movd xmm4, r15
movdqu xmm0, XMMWORD PTR [rbx+rsi]
xor ebx, 48
paddq xmm0, xmm7
movdqu xmm1, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm0
paddq xmm1, xmm3
xor ebx, 16
mov eax, ebx
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbx+rsi]
movdqu XMMWORD PTR [rbx+rsi], xmm1
paddq xmm0, xmm9
movdqu XMMWORD PTR [rax+rsi], xmm0
movdqa xmm0, xmm6
pxor xmm0, xmm7
movdqu XMMWORD PTR [rcx+rsi], xmm0
mov esi, edx
movdqu xmm5, XMMWORD PTR [r8+rdi]
and esi, 2097136
mov ecx, r8d
movd xmm0, r13
punpcklqdq xmm4, xmm0
xor r8d, 16
aesenc xmm5, xmm4
movdqu xmm0, XMMWORD PTR [r8+rdi]
xor r8d, 48
paddq xmm0, xmm8
movdqu xmm1, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm0
paddq xmm1, xmm4
xor r8d, 16
mov eax, r8d
xor rax, 32
movdqu xmm0, XMMWORD PTR [r8+rdi]
movdqu XMMWORD PTR [r8+rdi], xmm1
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rdi], xmm0
movdqa xmm0, xmm5
pxor xmm0, xmm8
movdqu XMMWORD PTR [rcx+rdi], xmm0
movd rdi, xmm5
movd rcx, xmm14
mov ebp, edi
mov r8, QWORD PTR [rcx+rsi]
mov r10, QWORD PTR [rcx+rsi+8]
lea r9, QWORD PTR [rcx+rsi]
xor esi, 16
movd xmm0, rsp
movd xmm1, rsi
movd xmm2, rdi
movd xmm11, rbp
movd xmm12, r15
movd xmm13, rdx
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp+16]
mov esi, DWORD PTR [rsp+20]
mov edi, DWORD PTR [rsp+24]
mov ebp, DWORD PTR [rsp+28]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd esp, xmm3
pextrd r15d, xmm3, 2
movd eax, xmm7
movd edx, xmm9
pextrd r9d, xmm9, 2
CryptonightWOW_template_double_part2:
movd rsp, xmm0
mov DWORD PTR [rsp+16], ebx
mov DWORD PTR [rsp+20], esi
mov DWORD PTR [rsp+24], edi
mov DWORD PTR [rsp+28], ebp
movd rsi, xmm1
movd rdi, xmm2
movd rbp, xmm11
movd r15, xmm12
movd rdx, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rbx, r8
mov rax, r8
mul rdx
and ebp, 2097136
mov r8, rax
movd xmm1, rdx
movd xmm0, r8
punpcklqdq xmm1, xmm0
pxor xmm1, XMMWORD PTR [rcx+rsi]
xor esi, 48
paddq xmm1, xmm7
movdqu xmm2, XMMWORD PTR [rsi+rcx]
xor rdx, QWORD PTR [rsi+rcx]
paddq xmm2, xmm3
xor r8, QWORD PTR [rsi+rcx+8]
movdqu XMMWORD PTR [rsi+rcx], xmm1
xor esi, 16
mov eax, esi
mov rsi, rcx
movdqu xmm0, XMMWORD PTR [rax+rcx]
movdqu XMMWORD PTR [rax+rcx], xmm2
paddq xmm0, xmm9
add r12, r8
xor rax, 32
add r14, rdx
movdqa xmm9, xmm7
movdqa xmm7, xmm6
movdqu XMMWORD PTR [rax+rcx], xmm0
mov QWORD PTR [r9+8], r12
xor r12, r10
mov QWORD PTR [r9], r14
movd rcx, xmm15
xor r14, rbx
mov r10d, ebp
mov ebx, r14d
xor ebp, 16
and ebx, 2097136
mov r8, QWORD PTR [r10+rcx]
mov r9, QWORD PTR [r10+rcx+8]
movd xmm0, rsp
movd xmm1, rbx
movd xmm2, rsi
movd xmm11, rdi
movd xmm12, rbp
movd xmm13, r15
mov [rsp+104], rcx
mov [rsp+112], r9
mov ebx, DWORD PTR [rsp]
mov esi, DWORD PTR [rsp+4]
mov edi, DWORD PTR [rsp+8]
mov ebp, DWORD PTR [rsp+12]
lea eax, [ebx+esi]
lea edx, [edi+ebp]
shl rdx, 32
or rax, rdx
xor r8, rax
movd xmm3, r8
movd esp, xmm4
pextrd r15d, xmm4, 2
movd eax, xmm8
movd edx, xmm10
pextrd r9d, xmm10, 2
CryptonightWOW_template_double_part3:
movd rsp, xmm0
mov DWORD PTR [rsp], ebx
mov DWORD PTR [rsp+4], esi
mov DWORD PTR [rsp+8], edi
mov DWORD PTR [rsp+12], ebp
movd rbx, xmm1
movd rsi, xmm2
movd rdi, xmm11
movd rbp, xmm12
movd r15, xmm13
mov rcx, [rsp+104]
mov r9, [rsp+112]
mov rax, r8
mul rdi
movd xmm1, rdx
movd xmm0, rax
punpcklqdq xmm1, xmm0
mov rdi, rcx
mov r8, rax
pxor xmm1, XMMWORD PTR [rbp+rcx]
xor ebp, 48
paddq xmm1, xmm8
xor r8, QWORD PTR [rbp+rcx+8]
xor rdx, QWORD PTR [rbp+rcx]
add r13, r8
movdqu xmm2, XMMWORD PTR [rbp+rcx]
add r15, rdx
movdqu XMMWORD PTR [rbp+rcx], xmm1
paddq xmm2, xmm4
xor ebp, 16
mov eax, ebp
xor rax, 32
movdqu xmm0, XMMWORD PTR [rbp+rcx]
movdqu XMMWORD PTR [rbp+rcx], xmm2
paddq xmm0, xmm10
movdqu XMMWORD PTR [rax+rcx], xmm0
movd rax, xmm3
movdqa xmm10, xmm8
mov QWORD PTR [r10+rcx], r15
movdqa xmm8, xmm5
xor r15, rax
mov QWORD PTR [r10+rcx+8], r13
mov r8d, r15d
xor r13, r9
and r8d, 2097136
dec r11d
jnz CryptonightWOW_template_double_mainloop
CryptonightWOW_template_double_part4:
mov rbx, QWORD PTR [rsp+400]
movaps xmm6, XMMWORD PTR [rsp+160]
movaps xmm7, XMMWORD PTR [rsp+176]
movaps xmm8, XMMWORD PTR [rsp+192]
movaps xmm9, XMMWORD PTR [rsp+208]
movaps xmm10, XMMWORD PTR [rsp+224]
movaps xmm11, XMMWORD PTR [rsp+240]
movaps xmm12, XMMWORD PTR [rsp+256]
movaps xmm13, XMMWORD PTR [rsp+272]
movaps xmm14, XMMWORD PTR [rsp+288]
movaps xmm15, XMMWORD PTR [rsp+304]
add rsp, 320
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
ret 0
CryptonightWOW_template_double_end:

View file

@ -0,0 +1,413 @@
mov rdx, [rcx+8]
mov rcx, [rcx]
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 524288
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movd xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movd xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movd xmm5, QWORD PTR [r8+104]
movd xmm7, rax
mov eax, 1
shl rax, 52
movd xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movd xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movd xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movd xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movd xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movd xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movd xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN(64)
main_loop_double_sandybridge:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movd xmm0, r11
movd xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movd r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movd xmm0, rbp
movd xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movd rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movd rdx, xmm5
shl rdx, 32
movd rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movd xmm0, rdx
xor rdx, [r11+r13]
movd xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movd r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movd r11, xmm0
psrldq xmm1, 8
movd r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movd rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movd rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movd r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js div_fix_1_sandybridge
div_fix_1_ret_sandybridge:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_sandybridge
div_fix_2_ret_sandybridge:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movd r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je sqrt_fix_1_sandybridge
sqrt_fix_1_ret_sandybridge:
movd r9, xmm10
psrldq xmm1, 8
movd r8, xmm1
test r8, 524287
je sqrt_fix_2_sandybridge
sqrt_fix_2_ret_sandybridge:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_sandybridge
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp cnv2_double_mainloop_asm_sandybridge_endp
div_fix_1_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_sandybridge
div_fix_2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_sandybridge
sqrt_fix_1_sandybridge:
movd r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movd xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_sandybridge
sqrt_fix_2_sandybridge:
psrldq xmm3, 8
movd r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movd xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_sandybridge
cnv2_double_mainloop_asm_sandybridge_endp:

View file

@ -0,0 +1,182 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movd xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN(64)
cnv2_main_loop_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
mov edi, 1023
shl rdi, 52
movd r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movd xmm0, rax
sqrtsd xmm1, xmm0
movd rdi, xmm1
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
mov rax, rsi
mul r14
movd xmm1, rax
movd xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_bulldozer_endp
sqrt_fixup_bulldozer:
movd r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -0,0 +1,188 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 524288
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movd xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movd xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movd xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movd xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN(64)
main_loop_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movd xmm0, r11
movd xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movd rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movd rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movd rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movd xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movd rdx, xmm3
test edx, 524287
je sqrt_fixup_ivybridge
psrlq xmm3, 19
sqrt_fixup_ivybridge_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movd xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movd xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne main_loop_ivybridge
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_main_loop_ivybridge_endp
sqrt_fixup_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movd xmm3, rdx
jmp sqrt_fixup_ivybridge_ret
cnv2_main_loop_ivybridge_endp:

View file

@ -0,0 +1,181 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
mov ax, 1023
shl rax, 52
movd xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN(64)
main_loop_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm0, r11
movd xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
movd r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movd rax, xmm0
div r9
movd xmm0, rax
movd xmm1, rdx
punpckldq xmm0, xmm1
movd r15, xmm0
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movd rdi, xmm1
test rdi, 524287
je sqrt_fixup_ryzen
shr rdi, 19
sqrt_fixup_ryzen_ret:
mov rax, rsi
mul r14
movd xmm1, rax
movd xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne main_loop_ryzen
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
jmp cnv2_main_loop_ryzen_endp
sqrt_fixup_ryzen:
movd r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ryzen_ret
cnv2_main_loop_ryzen_endp:

View file

@ -0,0 +1,413 @@
mov rdx, [rcx+8]
mov rcx, [rcx]
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 393216
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movd xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movd xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movd xmm5, QWORD PTR [r8+104]
movd xmm7, rax
mov eax, 1
shl rax, 52
movd xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movd xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movd xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movd xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movd xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movd xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movd xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movd xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movd xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN(64)
rwz_main_loop_double:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movd xmm0, r11
movd xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movd r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movd xmm0, rbp
movd xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rax+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movd rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movd rdx, xmm5
shl rdx, 32
movd rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movd xmm0, rdx
xor rdx, [r11+r13]
movd xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm3
paddq xmm1, xmm2
movdqu XMMWORD PTR [r8+r13], xmm0
xor r8d, 32
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm7
movdqu XMMWORD PTR [r11+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movd r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movd r11, xmm0
psrldq xmm1, 8
movd r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movd rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movd rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movd r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js rwz_div_fix_1
rwz_div_fix_1_ret:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js rwz_div_fix_2
rwz_div_fix_2_ret:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movd r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je rwz_sqrt_fix_1
rwz_sqrt_fix_1_ret:
movd r9, xmm10
psrldq xmm1, 8
movd r8, xmm1
test r8, 524287
je rwz_sqrt_fix_2
rwz_sqrt_fix_2_ret:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movd xmm0, rax
movd xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm3, xmm6
paddq xmm1, xmm11
paddq xmm0, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm3
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm0
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne rwz_main_loop_double
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp rwz_cnv2_double_mainloop_asm_endp
rwz_div_fix_1:
dec rbx
add r11, rdx
jmp rwz_div_fix_1_ret
rwz_div_fix_2:
dec rdx
add r8, r9
jmp rwz_div_fix_2_ret
rwz_sqrt_fix_1:
movd r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movd xmm5, r9
punpcklqdq xmm5, xmm0
jmp rwz_sqrt_fix_1_ret
rwz_sqrt_fix_2:
psrldq xmm3, 8
movd r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movd xmm0, r8
punpcklqdq xmm5, xmm0
jmp rwz_sqrt_fix_2_ret
rwz_cnv2_double_mainloop_asm_endp:

View file

@ -0,0 +1,188 @@
mov rcx, [rcx]
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov esi, 393216
mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movd xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136
movd xmm5, rax
xor eax, eax
mov QWORD PTR [rsp+16], rax
mov ax, 1023
shl rax, 52
movd xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movd xmm0, rcx
punpcklqdq xmm5, xmm0
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN(64)
rwz_main_loop:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
movd xmm0, r11
movd xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
movd rbp, xmm6
mov r9, rbp
and r9d, 2097136
movdqu xmm0, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm2, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
paddq xmm1, xmm7
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movd rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movd rax, xmm0
div r9
xorps xmm3, xmm3
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movd xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
psubq xmm3, XMMWORD PTR [rsp+16]
movd rdx, xmm3
test edx, 524287
je rwz_sqrt_fixup
psrlq xmm3, 19
rwz_sqrt_fixup_ret:
mov ecx, r10d
mov rax, rdi
mul rbp
movd xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 2097136
movd xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm4
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm5
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm2
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [r10+rbx], xmm1
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne rwz_main_loop
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp cnv2_rwz_main_loop_endp
rwz_sqrt_fixup:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movd xmm3, rdx
jmp rwz_sqrt_fixup_ret
cnv2_rwz_main_loop_endp:

View file

@ -0,0 +1,45 @@
#define ALIGN(x) .align 64
.intel_syntax noprefix
.section .text
.global cnv2_mainloop_ivybridge_asm
.global cnv2_mainloop_ryzen_asm
.global cnv2_mainloop_bulldozer_asm
.global cnv2_double_mainloop_sandybridge_asm
.global cnv2_rwz_mainloop_asm
.global cnv2_rwz_double_mainloop_asm
ALIGN(64)
cnv2_mainloop_ivybridge_asm:
#include "../cn2/cnv2_main_loop_ivybridge.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_ryzen_asm:
#include "../cn2/cnv2_main_loop_ryzen.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_mainloop_bulldozer_asm:
#include "../cn2/cnv2_main_loop_bulldozer.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_double_mainloop_sandybridge_asm:
#include "../cn2/cnv2_double_main_loop_sandybridge.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_rwz_mainloop_asm:
#include "cn2/cnv2_rwz_main_loop.inc"
ret 0
mov eax, 3735929054
ALIGN(64)
cnv2_rwz_double_mainloop_asm:
#include "cn2/cnv2_rwz_double_main_loop.inc"
ret 0
mov eax, 3735929054

View file

@ -0,0 +1,52 @@
_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv2_rwz_mainloop_asm
PUBLIC cnv2_rwz_double_mainloop_asm
ALIGN 64
cnv2_mainloop_ivybridge_asm PROC
INCLUDE cn2/cnv2_main_loop_ivybridge.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ivybridge_asm ENDP
ALIGN 64
cnv2_mainloop_ryzen_asm PROC
INCLUDE cn2/cnv2_main_loop_ryzen.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cn2/cnv2_main_loop_bulldozer.inc
ret 0
mov eax, 3735929054
cnv2_mainloop_bulldozer_asm ENDP
ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cn2/cnv2_double_main_loop_sandybridge.inc
ret 0
mov eax, 3735929054
cnv2_double_mainloop_sandybridge_asm ENDP
ALIGN(64)
cnv2_rwz_mainloop_asm PROC
INCLUDE cn2/cnv2_rwz_main_loop.inc
ret 0
mov eax, 3735929054
cnv2_rwz_mainloop_asm ENDP
ALIGN(64)
cnv2_rwz_double_mainloop_asm PROC
INCLUDE cn2/cnv2_rwz_double_main_loop.inc
ret 0
mov eax, 3735929054
cnv2_rwz_double_mainloop_asm ENDP
_TEXT_CNV2_MAINLOOP ENDS
END