Dataset initialization with AVX2 (WIP)

This commit is contained in:
SChernykh 2020-12-18 14:53:54 +01:00
parent 6b21a51a2f
commit 515a85e66c
17 changed files with 721 additions and 90 deletions

View file

@ -0,0 +1,28 @@
;# Constant data used by the AVX2 dataset initialization code.
;# Every value is spelled out byte by byte in little-endian order.
;# NOTE(review): the label names suggest a multiplier for r0 and one
;# addend for each of r1..r7 - confirm against the code that loads them.
;# Four 64-bit values: 2, 3, 4, 5 (32 bytes, one YMM register wide).
r0_avx2_increments:
db 2,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0
;# One 64-bit value: 0x0000000100000000 (1 shifted left by 32).
mul_hi_avx2_data:
db 0,0,0,0,1,0,0,0
;# One 64-bit value: 0x5851F42D4C957F2D (decimal form on the next line).
r0_avx2_mul:
;#/ 6364136223846793005
db 45, 127, 149, 76, 45, 244, 81, 88
;# One 64-bit value: 0x810A978A59F5A1FC.
r1_avx2_add:
;#/ 9298411001130361340
db 252, 161, 245, 89, 138, 151, 10, 129
;# One 64-bit value: 0xA77099DF38C2D846.
r2_avx2_add:
;#/ 12065312585734608966
db 70, 216, 194, 56, 223, 153, 112, 167
;# One 64-bit value: 0x8126B91CBF22495C.
r3_avx2_add:
;#/ 9306329213124626780
db 92, 73, 34, 191, 28, 185, 38, 129
;# One 64-bit value: 0x494D2597179F8A62.
r4_avx2_add:
;#/ 5281919268842080866
db 98, 138, 159, 23, 151, 37, 77, 73
;# One 64-bit value: 0x9237EFB9CEAAEC0C.
r5_avx2_add:
;#/ 10536153434571861004
db 12, 236, 170, 206, 185, 239, 55, 146
;# One 64-bit value: 0x2F2A56746CE62D78.
r6_avx2_add:
;#/ 3398623926847679864
db 120, 45, 230, 108, 116, 86, 42, 47
;# One 64-bit value: 0x84853BF7B62CE54E.
r7_avx2_add:
;#/ 9549104520008361294
db 78, 229, 44, 182, 247, 59, 133, 132

View file

@ -0,0 +1,31 @@
;# Epilogue: tear down the stack frame and return to the caller.
;# Expected stack layout on entry (lowest address first):
;#   32 bytes of scratch, one 8-byte slot that is popped into r9,
;#   256 bytes holding xmm0..xmm15, then the eight saved
;#   general-purpose registers and the return address.
;# NOTE(review): the scratch area and the r9 slot are presumably set up
;# by loop-setup code that is not part of this fragment - confirm.
add rsp, 32
pop r9
;# Clear the upper halves of all YMM registers BEFORE the legacy-SSE
;# movdqu loads below. Running legacy SSE instructions while the upper
;# halves are dirty costs an SSE/AVX transition penalty (or creates
;# false dependencies, depending on the CPU). The final register state
;# is the same either way: movdqu leaves bits 255:128 untouched and
;# vzeroupper leaves bits 127:0 untouched.
vzeroupper
;# restore all XMM registers saved by the prologue
movdqu xmm0, xmmword ptr [rsp]
movdqu xmm1, xmmword ptr [rsp + 16]
movdqu xmm2, xmmword ptr [rsp + 32]
movdqu xmm3, xmmword ptr [rsp + 48]
movdqu xmm4, xmmword ptr [rsp + 64]
movdqu xmm5, xmmword ptr [rsp + 80]
movdqu xmm6, xmmword ptr [rsp + 96]
movdqu xmm7, xmmword ptr [rsp + 112]
movdqu xmm8, xmmword ptr [rsp + 128]
movdqu xmm9, xmmword ptr [rsp + 144]
movdqu xmm10, xmmword ptr [rsp + 160]
movdqu xmm11, xmmword ptr [rsp + 176]
movdqu xmm12, xmmword ptr [rsp + 192]
movdqu xmm13, xmmword ptr [rsp + 208]
movdqu xmm14, xmmword ptr [rsp + 224]
movdqu xmm15, xmmword ptr [rsp + 240]
add rsp, 256
;# restore general-purpose registers in reverse push order
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rdi
pop rbp
pop rbx
ret

View file

@ -0,0 +1,37 @@
;# Prefetch stage: issue non-temporal prefetches for the dataset area at
;# rsi and for five cache entries selected by rbp .. rbp+4.
;# Reads:  rsi (dataset pointer), rbp (item index), rdi (cache base).
;# Writes: rbx, rax, flags, and the four qword slots [rsp] .. [rsp+24].
;# NOTE(review): instruction order and encoding are left untouched because
;# this fragment may be copied and patched byte-wise - confirm before
;# restructuring.
;# prefetch RandomX dataset lines
;# (five locations 64 bytes apart, rsi+0 .. rsi+256, 320 bytes in total)
prefetchnta byte ptr [rsi]
prefetchnta byte ptr [rsi+64]
prefetchnta byte ptr [rsi+128]
prefetchnta byte ptr [rsi+192]
prefetchnta byte ptr [rsi+256]
;# prefetch RandomX cache lines
;# Each address is rdi + ((index AND RANDOMX_CACHE_MASK) shifted left by 6).
;# index = rbp: the address stays in rbx only, it is not written to the stack
mov rbx, rbp
and rbx, RANDOMX_CACHE_MASK
shl rbx, 6
add rbx, rdi
prefetchnta byte ptr [rbx]
;# index = rbp+1: address saved at [rsp]
lea rax, [rbp+1]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp], rax
;# index = rbp+2: address saved at [rsp+8]
lea rax, [rbp+2]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+8], rax
;# index = rbp+3: address saved at [rsp+16]
lea rax, [rbp+3]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+16], rax
;# index = rbp+4: address saved at [rsp+24]
lea rax, [rbp+4]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
prefetchnta byte ptr [rax]
mov [rsp+24], rax

View file

@ -0,0 +1,38 @@
;# Loop tail: write 320 bytes of results at rsi, advance the counters and
;# branch back while rbp is below the bound stored at [rsp+32].
;# Bytes 0..63 come from r8..r15 (one qword each).
;# Bytes 64..319 come from ymm0..ymm7 after a 4x4 qword transpose, so
;# that lane k of ymm0..ymm7 ends up as eight consecutive qwords at
;# rsi + 64*(k+1).
;# NOTE(review): this assumes ymmN carries the same logical register N
;# for four different items, one per lane - confirm against the code
;# that fills ymm0..ymm7.
;# The scalar stores are interleaved with the first shuffle step.
;# Clobbers ymm0..ymm15 and flags.
;# Step 1: vpunpcklqdq pairs up lanes 0 and 2, vpunpckhqdq lanes 1 and 3.
mov qword ptr [rsi+0], r8
vpunpcklqdq ymm8, ymm0, ymm1
mov qword ptr [rsi+8], r9
vpunpcklqdq ymm9, ymm2, ymm3
mov qword ptr [rsi+16], r10
vpunpcklqdq ymm10, ymm4, ymm5
mov qword ptr [rsi+24], r11
vpunpcklqdq ymm11, ymm6, ymm7
mov qword ptr [rsi+32], r12
vpunpckhqdq ymm12, ymm0, ymm1
mov qword ptr [rsi+40], r13
vpunpckhqdq ymm13, ymm2, ymm3
mov qword ptr [rsi+48], r14
vpunpckhqdq ymm14, ymm4, ymm5
mov qword ptr [rsi+56], r15
vpunpckhqdq ymm15, ymm6, ymm7
;# Step 2: vperm2i128 with immediate 32 (0x20) joins the low 128-bit
;# halves of both sources, immediate 49 (0x31) joins the high halves.
;# lane 0 of ymm0..ymm7 -> rsi+64 .. rsi+127
vperm2i128 ymm0, ymm8, ymm9, 32
vperm2i128 ymm1, ymm10, ymm11, 32
vmovdqu ymmword ptr [rsi+64], ymm0
vmovdqu ymmword ptr [rsi+96], ymm1
;# lane 1 of ymm0..ymm7 -> rsi+128 .. rsi+191
vperm2i128 ymm2, ymm12, ymm13, 32
vperm2i128 ymm3, ymm14, ymm15, 32
vmovdqu ymmword ptr [rsi+128], ymm2
vmovdqu ymmword ptr [rsi+160], ymm3
;# lane 2 of ymm0..ymm7 -> rsi+192 .. rsi+255
vperm2i128 ymm4, ymm8, ymm9, 49
vperm2i128 ymm5, ymm10, ymm11, 49
vmovdqu ymmword ptr [rsi+192], ymm4
vmovdqu ymmword ptr [rsi+224], ymm5
;# lane 3 of ymm0..ymm7 -> rsi+256 .. rsi+319
vperm2i128 ymm6, ymm12, ymm13, 49
vperm2i128 ymm7, ymm14, ymm15, 49
vmovdqu ymmword ptr [rsi+256], ymm6
vmovdqu ymmword ptr [rsi+288], ymm7
;# Five 64-byte results were written: advance the index by 5 and the
;# output pointer by 320 bytes, then compare (unsigned) with the bound.
add rbp, 5
add rsi, 320
cmp rbp, qword ptr [rsp+32]
;# Bytes 0F 82 encode jb rel32; the 32-bit displacement is left as zero.
;# NOTE(review): presumably filled in by whoever copies this fragment -
;# confirm; do not replace with a symbolic jump without checking.
db 15, 130, 0, 0, 0, 0 ;# jb rel32

View file

@ -0,0 +1,27 @@
;# Prologue: save eight general-purpose registers (64 bytes) and all
;# sixteen XMM registers (256 bytes) on the stack.
;# After this fragment xmmN is stored at [rsp + 16*N]. The matching
;# epilogue must release the 256-byte area and pop the registers in the
;# reverse order of the pushes below.
;# movdqu is used, so no stack alignment is required for the stores.
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
;# save all XMM registers just to be safe for all calling conventions
sub rsp, 256
movdqu xmmword ptr [rsp], xmm0
movdqu xmmword ptr [rsp + 16], xmm1
movdqu xmmword ptr [rsp + 32], xmm2
movdqu xmmword ptr [rsp + 48], xmm3
movdqu xmmword ptr [rsp + 64], xmm4
movdqu xmmword ptr [rsp + 80], xmm5
movdqu xmmword ptr [rsp + 96], xmm6
movdqu xmmword ptr [rsp + 112], xmm7
movdqu xmmword ptr [rsp + 128], xmm8
movdqu xmmword ptr [rsp + 144], xmm9
movdqu xmmword ptr [rsp + 160], xmm10
movdqu xmmword ptr [rsp + 176], xmm11
movdqu xmmword ptr [rsp + 192], xmm12
movdqu xmmword ptr [rsp + 208], xmm13
movdqu xmmword ptr [rsp + 224], xmm14
movdqu xmmword ptr [rsp + 240], xmm15

View file

@ -0,0 +1,50 @@
;# XOR stage: load 64 bytes from each of four pointers, transpose them
;# from item-major to register-major order and XOR the result into
;# ymm0..ymm7.
;# In the lane comments below rN[k] means qword N of the 64 bytes read
;# through pointer number k (k = 1..4).
;# On entry the four pointers are at [rsp] .. [rsp+24]. This fragment
;# first reserves 40 bytes to spill rbx and ymm14, so the same pointers
;# are then read from [rsp+40] .. [rsp+64].
;# Preserved: rbx, ymm14 (spilled and reloaded).
;# Clobbered: rax, rcx, rdx, ymm8..ymm13, flags.
;# vperm2i128 immediate 32 joins the low 128-bit halves of both sources,
;# immediate 49 joins the high halves.
sub rsp, 40
mov [rsp], rbx
vmovdqu ymmword ptr [rsp+8], ymm14
;# fetch pointers 1..4 (offsets shifted by the 40 bytes reserved above)
mov rax, [rsp+40]
mov rbx, [rsp+48]
mov rcx, [rsp+56]
mov rdx, [rsp+64]
;# first 32 bytes of each source: qwords 0..3
vmovdqu ymm8, ymmword ptr [rax] ;# ymm8 = r0[1], r1[1], r2[1], r3[1]
vmovdqu ymm9, ymmword ptr [rbx] ;# ymm9 = r0[2], r1[2], r2[2], r3[2]
vmovdqu ymm10, ymmword ptr [rcx] ;# ymm10 = r0[3], r1[3], r2[3], r3[3]
vmovdqu ymm11, ymmword ptr [rdx] ;# ymm11 = r0[4], r1[4], r2[4], r3[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r0[1], r0[2], r2[1], r2[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r0[3], r0[4], r2[3], r2[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r0[1], r0[2], r0[3], r0[4]
vpxor ymm0, ymm0, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r2[1], r2[2], r2[3], r2[4]
vpxor ymm2, ymm2, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r1[1], r1[2], r3[1], r3[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r1[3], r1[4], r3[3], r3[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r1[1], r1[2], r1[3], r1[4]
vpxor ymm1, ymm1, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r3[1], r3[2], r3[3], r3[4]
vpxor ymm3, ymm3, ymm14
;# second 32 bytes of each source: qwords 4..7, same shuffle pattern
vmovdqu ymm8, ymmword ptr [rax+32] ;# ymm8 = r4[1], r5[1], r6[1], r7[1]
vmovdqu ymm9, ymmword ptr [rbx+32] ;# ymm9 = r4[2], r5[2], r6[2], r7[2]
vmovdqu ymm10, ymmword ptr [rcx+32] ;# ymm10 = r4[3], r5[3], r6[3], r7[3]
vmovdqu ymm11, ymmword ptr [rdx+32] ;# ymm11 = r4[4], r5[4], r6[4], r7[4]
vpunpcklqdq ymm12, ymm8, ymm9 ;# ymm12 = r4[1], r4[2], r6[1], r6[2]
vpunpcklqdq ymm13, ymm10, ymm11 ;# ymm13 = r4[3], r4[4], r6[3], r6[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r4[1], r4[2], r4[3], r4[4]
vpxor ymm4, ymm4, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r6[1], r6[2], r6[3], r6[4]
vpxor ymm6, ymm6, ymm14
vpunpckhqdq ymm12, ymm8, ymm9 ;# ymm12 = r5[1], r5[2], r7[1], r7[2]
vpunpckhqdq ymm13, ymm10, ymm11 ;# ymm13 = r5[3], r5[4], r7[3], r7[4]
vperm2i128 ymm14, ymm12, ymm13, 32 ;# ymm14 = r5[1], r5[2], r5[3], r5[4]
vpxor ymm5, ymm5, ymm14
vperm2i128 ymm14, ymm12, ymm13, 49 ;# ymm14 = r7[1], r7[2], r7[3], r7[4]
vpxor ymm7, ymm7, ymm14
;# reload the spilled registers and release the 40 bytes
mov rbx, [rsp]
vmovdqu ymm14, ymmword ptr [rsp+8]
add rsp, 40

View file

@ -0,0 +1,29 @@
;# Turn the four qword lanes of ymm0 into four addresses and prefetch them.
;# ymm0 is spilled to [rsp] .. [rsp+31], overwriting those four qword
;# slots. Each lane is then replaced in place by
;#   rdi + ((lane AND RANDOMX_CACHE_MASK) shifted left by 6)
;# and a non-temporal prefetch is issued for the resulting address.
;# On exit [rsp], [rsp+8], [rsp+16], [rsp+24] hold the four addresses.
;# Reads: ymm0, rdi. Clobbers: rax, flags.
;# NOTE(review): instruction order and encoding are left untouched because
;# this fragment may be copied and patched byte-wise - confirm before
;# restructuring.
vmovdqu ymmword ptr [rsp], ymm0
;# lane 0
mov rax, [rsp]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp], rax
prefetchnta byte ptr [rax]
;# lane 1
mov rax, [rsp+8]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+8], rax
prefetchnta byte ptr [rax]
;# lane 2
mov rax, [rsp+16]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+16], rax
prefetchnta byte ptr [rax]
;# lane 3
mov rax, [rsp+24]
and rax, RANDOMX_CACHE_MASK
shl rax, 6
add rax, rdi
mov [rsp+24], rax
prefetchnta byte ptr [rax]