353 lines
9.5 KiB
ArmAsm
353 lines
9.5 KiB
ArmAsm
# Copyright (c) 2018-2019, tevador <tevador@gmail.com>
|
|
#
|
|
# All rights reserved.
|
|
#
|
|
# Redistribution and use in source and binary forms, with or without
|
|
# modification, are permitted provided that the following conditions are met:
|
|
# * Redistributions of source code must retain the above copyright
|
|
# notice, this list of conditions and the following disclaimer.
|
|
# * Redistributions in binary form must reproduce the above copyright
|
|
# notice, this list of conditions and the following disclaimer in the
|
|
# documentation and/or other materials provided with the distribution.
|
|
# * Neither the name of the copyright holder nor the
|
|
# names of its contributors may be used to endorse or promote products
|
|
# derived from this software without specific prior written permission.
|
|
#
|
|
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
|
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
;# Select GAS Intel syntax: destination operand first, registers without a % prefix.
.intel_syntax noprefix
|
|
;# Symbol naming helper: when __APPLE__ is defined the label name gets a
;# leading underscore, otherwise the name is used unchanged.
;# Both branches also switch to the text section.
#if defined(__APPLE__)
|
|
.text
|
|
#define DECL(x) _##x
|
|
#else
|
|
.section .text
|
|
#define DECL(x) x
|
|
#endif
|
|
|
|
;# WINABI is defined for __WIN32__ and __CYGWIN__ builds. Code further down
;# tests it to choose between two sets of argument registers and two sets of
;# prologue/epilogue include files.
#if defined(__WIN32__) || defined(__CYGWIN__)
|
|
#define WINABI
|
|
#endif
|
|
|
|
;# Symbols exported from this file, listed in the order in which the labels
;# appear below. Some of them only mark a position in the instruction stream
;# (for example randomx_prefetch_scratchpad_end has no instructions of its own).
;# NOTE(review): presumably external code uses neighbouring labels to locate
;# and size the fragments between them - confirm against the users of these symbols.
.global DECL(randomx_prefetch_scratchpad)
|
|
.global DECL(randomx_prefetch_scratchpad_bmi2)
|
|
.global DECL(randomx_prefetch_scratchpad_end)
|
|
.global DECL(randomx_program_prologue)
|
|
.global DECL(randomx_program_prologue_first_load)
|
|
.global DECL(randomx_program_imul_rcp_store)
|
|
.global DECL(randomx_program_loop_begin)
|
|
.global DECL(randomx_program_loop_load)
|
|
.global DECL(randomx_program_loop_load_xop)
|
|
.global DECL(randomx_program_start)
|
|
.global DECL(randomx_program_read_dataset)
|
|
.global DECL(randomx_program_read_dataset_sshash_init)
|
|
.global DECL(randomx_program_read_dataset_sshash_fin)
|
|
.global DECL(randomx_program_loop_store)
|
|
.global DECL(randomx_program_loop_end)
|
|
.global DECL(randomx_dataset_init)
|
|
.global DECL(randomx_dataset_init_avx2_prologue)
|
|
.global DECL(randomx_dataset_init_avx2_loop_end)
|
|
.global DECL(randomx_dataset_init_avx2_epilogue)
|
|
.global DECL(randomx_dataset_init_avx2_ssh_load)
|
|
.global DECL(randomx_dataset_init_avx2_ssh_prefetch)
|
|
.global DECL(randomx_program_epilogue)
|
|
.global DECL(randomx_sshash_load)
|
|
.global DECL(randomx_sshash_prefetch)
|
|
.global DECL(randomx_sshash_end)
|
|
.global DECL(randomx_sshash_init)
|
|
.global DECL(randomx_program_end)
|
|
.global DECL(randomx_reciprocal_fast)
|
|
|
|
;# Address masks.
;# 2097088 = 0x1FFFC0: keeps bits 6..20, so the result is a multiple of 64 below 2 MiB.
;# It is applied to both halves of rax before the prefetches and the first load below.
#define RANDOMX_SCRATCHPAD_MASK 2097088
|
|
;# 2147483584 = 0x7FFFFFC0: keeps bits 6..30 (multiple of 64 below 2 GiB).
;# Not referenced in the visible part of this file; presumably used by the included fragments.
#define RANDOMX_DATASET_BASE_MASK 2147483584
|
|
;# 4194303 = 0x3FFFFF: keeps the low 22 bits.
;# Not referenced in the visible part of this file; presumably used by the included fragments.
#define RANDOMX_CACHE_MASK 4194303
|
|
|
|
;# Lets the spelling db be used in place of the GAS .byte directive.
;# Not used in the visible part of this file; presumably needed by the included fragments.
#define db .byte
|
|
|
|
;# Issue two prefetcht0 hints for addresses derived from rax.
;# In:  rax = two 32-bit values packed into one register (low and high half)
;#      rsi = base address that the masked offsets are added to
;# Clobbers rax, rdx and flags.
;# There is no ret at the end; the next label follows immediately.
;# NOTE(review): this looks like a code fragment delimited by labels rather
;# than a callable function - confirm against the code that uses the symbol.
DECL(randomx_prefetch_scratchpad):
|
|
;# keep a full copy of rax, because the 32-bit and below clears its upper half
mov rdx, rax
|
|
;# eax = low half reduced to a 64-byte aligned offset
and eax, RANDOMX_SCRATCHPAD_MASK
|
|
prefetcht0 [rsi+rax]
|
|
;# bring the original high half of rax into the low half of rdx
ror rdx, 32
|
|
;# edx = high half reduced to a 64-byte aligned offset
and edx, RANDOMX_SCRATCHPAD_MASK
|
|
prefetcht0 [rsi+rdx]
|
|
|
|
;# Same two prefetches as randomx_prefetch_scratchpad, but the copy and the
;# rotate are done by a single rorx (a BMI2 instruction), which writes the
;# rotated value to rdx and leaves rax and the flags untouched.
;# In:  rax = two packed 32-bit values, rsi = base address
;# Clobbers rax, rdx and flags (the flags are changed by the and instructions).
DECL(randomx_prefetch_scratchpad_bmi2):
|
|
;# rdx = rax rotated by 32 bits, so the high half of rax is now in edx
rorx rdx, rax, 32
|
|
and eax, RANDOMX_SCRATCHPAD_MASK
|
|
prefetcht0 [rsi+rax]
|
|
and edx, RANDOMX_SCRATCHPAD_MASK
|
|
prefetcht0 [rsi+rdx]
|
|
|
|
;# Marks the address right after the BMI2 prefetch fragment; no code of its own.
DECL(randomx_prefetch_scratchpad_end):
|
|
|
|
;# Program prologue, placed on a 64-byte boundary.
;# The first part comes from a platform specific include file (not visible
;# here), selected by WINABI. After that, three 128-bit constants are loaded
;# into xmm13, xmm14 and xmm15 using RIP-relative addressing.
;# Execution continues into randomx_program_prologue_first_load (no ret).
.balign 64
|
|
DECL(randomx_program_prologue):
|
|
#if defined(WINABI)
|
|
#include "asm/program_prologue_win64.inc"
|
|
#else
|
|
#include "asm/program_prologue_linux.inc"
|
|
#endif
|
|
;# mantissaMask, exp240 and scaleMask are not defined in the visible part of
;# this file; presumably they come from asm/program_xmm_constants.inc.
;# movapd needs its memory operand to be 16-byte aligned.
movapd xmm13, xmmword ptr [mantissaMask+rip]
|
|
movapd xmm14, xmmword ptr [exp240+rip]
|
|
movapd xmm15, xmmword ptr [scaleMask+rip]
|
|
|
|
;# Last part of the prologue.
;# - splits rax into two masked offsets: eax from the low half, edx from the high half
;# - reserves 40 bytes of stack and fills part of it with constants
;# - jumps to randomx_program_imul_rcp_store
;# The 40 bytes are not released in this block.
DECL(randomx_program_prologue_first_load):
|
|
;# keep a full copy of rax, because the 32-bit and below clears its upper half
mov rdx, rax
|
|
and eax, RANDOMX_SCRATCHPAD_MASK
|
|
;# move the original high half of rax into the low half of rdx
ror rdx, 32
|
|
and edx, RANDOMX_SCRATCHPAD_MASK
|
|
sub rsp, 40
|
|
;# Four dwords at [rsp+0] .. [rsp+12]. They differ only in bits 13-14.
;# NOTE(review): these look like MXCSR images (flush-to-zero, denormals-are-zero,
;# all exceptions masked) for the four rounding modes - confirm against the
;# code that reads these stack slots.
mov dword ptr [rsp], 0x9FC0
|
|
mov dword ptr [rsp+4], 0xBFC0
|
|
mov dword ptr [rsp+8], 0xDFC0
|
|
mov dword ptr [rsp+12], 0xFFC0
|
|
;# all-ones dword at [rsp+32]; its consumer is not visible in this file
mov dword ptr [rsp+32], -1
|
|
;# three one-byte nops; NOTE(review): presumably padding - confirm before removing
nop
|
|
nop
|
|
nop
|
|
jmp DECL(randomx_program_imul_rcp_store)
|
|
|
|
;# Contents of asm/program_xmm_constants.inc, placed on a 64-byte boundary inside
;# the text section. The jmp at the end of randomx_program_prologue_first_load
;# skips over this area.
.balign 64
|
|
#include "asm/program_xmm_constants.inc"
|
|
|
|
;# Fragment whose body comes from asm/program_imul_rcp_store.inc (not visible
;# here). It ends with a jump to the 64-byte aligned loop entry.
DECL(randomx_program_imul_rcp_store):
|
|
#include "asm/program_imul_rcp_store.inc"
|
|
jmp DECL(randomx_program_loop_begin)
|
|
|
|
;# Main loop section, placed on a 64-byte boundary. The labels below split one
;# contiguous instruction stream into named parts; most bodies come from
;# include files that are not visible here. None of the parts ends with a
;# ret or jmp in this file, so each one runs into the next.
;# NOTE(review): presumably external code copies or patches the parts
;# individually - confirm against the users of these symbols.
.balign 64
|
|
DECL(randomx_program_loop_begin):
|
|
nop
|
|
|
|
DECL(randomx_program_loop_load):
|
|
#include "asm/program_loop_load.inc"
|
|
|
|
;# NOTE(review): emitted directly after randomx_program_loop_load; judging by
;# the name this is an alternative version using XOP instructions, and
;# presumably only one of the two is used at a time - confirm.
DECL(randomx_program_loop_load_xop):
|
|
#include "asm/program_loop_load_xop.inc"
|
|
|
|
;# single nop; NOTE(review): presumably a placeholder for code supplied
;# elsewhere - confirm
DECL(randomx_program_start):
|
|
nop
|
|
|
|
DECL(randomx_program_read_dataset):
|
|
#include "asm/program_read_dataset.inc"
|
|
|
|
DECL(randomx_program_read_dataset_sshash_init):
|
|
#include "asm/program_read_dataset_sshash_init.inc"
|
|
|
|
DECL(randomx_program_read_dataset_sshash_fin):
|
|
#include "asm/program_read_dataset_sshash_fin.inc"
|
|
|
|
DECL(randomx_program_loop_store):
|
|
#include "asm/program_loop_store.inc"
|
|
|
|
;# end of the loop section; a single nop follows the label
DECL(randomx_program_loop_end):
|
|
nop
|
|
|
|
;# randomx_dataset_init: writes consecutive 64-byte items to the dataset.
;# Arguments, as read by the register moves below:
;#   1: pointer whose first qword is loaded into rdi (cache->memory)
;#   2: destination pointer (dataset), kept in rsi
;#   3: first block index, kept in rbp
;#   4: max. block index (loop limit), kept on the stack at [rsp]
;# Argument registers: rdi, rsi, rdx, rcx by default; rcx, rdx, r8, r9 with WINABI.
;# Saves and restores rbx, rbp, r12-r15 (plus rdi and rsi with WINABI).
;# No return value is set up; rax ends up holding the popped loop limit.
.balign 64
|
|
DECL(randomx_dataset_init):
|
|
push rbx
|
|
push rbp
|
|
push r12
|
|
push r13
|
|
push r14
|
|
push r15
|
|
#if defined(WINABI)
|
|
push rdi
|
|
push rsi
|
|
mov rdi, qword ptr [rcx] ;# cache->memory
|
|
mov rsi, rdx ;# dataset
|
|
mov rbp, r8 ;# block index
|
|
push r9 ;# max. block index
|
|
#else
|
|
mov rdi, qword ptr [rdi] ;# cache->memory
|
|
;# dataset in rsi
|
|
mov rbp, rdx ;# block index
|
|
push rcx ;# max. block index
|
|
#endif
|
|
;# One iteration per 64-byte item. The loop test is at the bottom, so one item
;# is always written, even if the first index is not below the limit.
init_block_loop:
|
|
;# prefetch the destination line with intent to write
prefetchw byte ptr [rsi]
|
|
;# pass the current block index in rbx (randomx_sshash_init below reads rbx)
mov rbx, rbp
|
|
;# Hand-assembled near call: opcode 0xE8 followed by a 32-bit displacement.
;# The displacement is relative to call_offset (the address after the call),
;# so the call target is randomx_dataset_init + 32768. The code at that
;# address is not part of this file.
;# NOTE(review): presumably it is placed there at run time and returns the
;# item in r8-r15, which is what the stores below write out - confirm.
.byte 232 ;# 0xE8 = call
|
|
;# .set CALL_LOC,
|
|
.int 32768 - (call_offset - DECL(randomx_dataset_init))
|
|
call_offset:
|
|
;# store r8..r15 as one 64-byte item at [rsi]
mov qword ptr [rsi+0], r8
|
|
mov qword ptr [rsi+8], r9
|
|
mov qword ptr [rsi+16], r10
|
|
mov qword ptr [rsi+24], r11
|
|
mov qword ptr [rsi+32], r12
|
|
mov qword ptr [rsi+40], r13
|
|
mov qword ptr [rsi+48], r14
|
|
mov qword ptr [rsi+56], r15
|
|
;# advance to the next block index and the next 64-byte destination
add rbp, 1
|
|
add rsi, 64
|
|
;# [rsp] holds the max. block index pushed above; the comparison is unsigned
cmp rbp, qword ptr [rsp]
|
|
jb init_block_loop
|
|
;# drop the max. block index from the stack
pop rax
|
|
#if defined(WINABI)
|
|
pop rsi
|
|
pop rdi
|
|
#endif
|
|
pop r15
|
|
pop r14
|
|
pop r13
|
|
pop r12
|
|
pop rbp
|
|
pop rbx
|
|
ret
|
|
|
|
;# AVX2 variant of the dataset initialisation, first part.
;# It takes the same four arguments in the same registers as
;# randomx_dataset_init: rdi = first qword behind argument 1, rsi = dataset,
;# rbp = block index, and the max. block index is pushed on the stack.
;# Register saving comes from an include file that is not visible here.
;# The block ends where generated code is expected to follow (see the last comment).
.balign 64
|
|
DECL(randomx_dataset_init_avx2_prologue):
|
|
#include "asm/program_sshash_avx2_save_registers.inc"
|
|
|
|
#if defined(WINABI)
|
|
mov rdi, qword ptr [rcx] ;# cache->memory
|
|
mov rsi, rdx ;# dataset
|
|
mov rbp, r8 ;# block index
|
|
push r9 ;# max. block index
|
|
#else
|
|
mov rdi, qword ptr [rdi] ;# cache->memory
|
|
;# dataset in rsi
|
|
mov rbp, rdx ;# block index
|
|
push rcx ;# max. block index
|
|
#endif
|
|
;# reserve 40 bytes of stack; [rsp+32] is used below as a temporary
sub rsp, 40
|
|
|
|
;# skip over the constants that are included right after this jump
jmp randomx_dataset_init_avx2_prologue_loop_begin
|
|
#include "asm/program_sshash_avx2_constants.inc"
|
|
|
|
.balign 64
|
|
randomx_dataset_init_avx2_prologue_loop_begin:
|
|
#include "asm/program_sshash_avx2_loop_begin.inc"
|
|
|
|
;# r8 = (rbp + 1) * r0_avx2_mul, then r9..r15 = constant XOR r8.
;# The *_avx2_* constants are not defined in the visible part of this file;
;# presumably they come from asm/program_sshash_avx2_constants.inc.
;# init integer registers (lane 0)
|
|
lea r8, [rbp+1]
|
|
imul r8, qword ptr [r0_avx2_mul+rip]
|
|
mov r9, qword ptr [r1_avx2_add+rip]
|
|
xor r9, r8
|
|
mov r10, qword ptr [r2_avx2_add+rip]
|
|
xor r10, r8
|
|
mov r11, qword ptr [r3_avx2_add+rip]
|
|
xor r11, r8
|
|
mov r12, qword ptr [r4_avx2_add+rip]
|
|
xor r12, r8
|
|
mov r13, qword ptr [r5_avx2_add+rip]
|
|
xor r13, r8
|
|
mov r14, qword ptr [r6_avx2_add+rip]
|
|
xor r14, r8
|
|
mov r15, qword ptr [r7_avx2_add+rip]
|
|
xor r15, r8
|
|
|
|
;# rbp is spilled to the stack so that it can be broadcast to all four
;# 64-bit lanes of ymm0; r0_avx2_increments is then added per lane.
;# init AVX registers (lanes 1-4)
|
|
mov qword ptr [rsp+32], rbp
|
|
vbroadcastsd ymm0, qword ptr [rsp+32]
|
|
vpaddq ymm0, ymm0, ymmword ptr [r0_avx2_increments+rip]
|
|
|
|
;# Low 64 bits of a 64x64-bit product per lane, built from 32x32-bit
;# multiplies: lo(a)*lo(b) + ((lo(a)*hi(b) + hi(a)*lo(b)) << 32),
;# with a = ymm0 and b = the broadcast r0_avx2_mul.
;# ymm0 *= r0_avx2_mul
|
|
vbroadcastsd ymm1, qword ptr [r0_avx2_mul+rip]
|
|
;# ymm8 = hi(a), ymm9 = hi(b)
vpsrlq ymm8, ymm0, 32
|
|
vpsrlq ymm9, ymm1, 32
|
|
;# ymm10 = lo(a) * lo(b)
vpmuludq ymm10, ymm0, ymm1
|
|
;# ymm11 = hi(b) * lo(a)
vpmuludq ymm11, ymm9, ymm0
|
|
;# ymm0 = hi(a) * lo(b)
vpmuludq ymm0, ymm8, ymm1
|
|
;# move both cross products to the upper half and sum all three terms
vpsllq ymm11, ymm11, 32
|
|
vpsllq ymm0, ymm0, 32
|
|
vpaddq ymm10, ymm10, ymm11
|
|
vpaddq ymm0, ymm10, ymm0
|
|
|
|
;# ymm1..ymm7 = broadcast constant XOR ymm0, mirroring r9..r15 above
vbroadcastsd ymm1, qword ptr [r1_avx2_add+rip]
|
|
vpxor ymm1, ymm0, ymm1
|
|
vbroadcastsd ymm2, qword ptr [r2_avx2_add+rip]
|
|
vpxor ymm2, ymm0, ymm2
|
|
vbroadcastsd ymm3, qword ptr [r3_avx2_add+rip]
|
|
vpxor ymm3, ymm0, ymm3
|
|
vbroadcastsd ymm4, qword ptr [r4_avx2_add+rip]
|
|
vpxor ymm4, ymm0, ymm4
|
|
vbroadcastsd ymm5, qword ptr [r5_avx2_add+rip]
|
|
vpxor ymm5, ymm0, ymm5
|
|
vbroadcastsd ymm6, qword ptr [r6_avx2_add+rip]
|
|
vpxor ymm6, ymm0, ymm6
|
|
vbroadcastsd ymm7, qword ptr [r7_avx2_add+rip]
|
|
vpxor ymm7, ymm0, ymm7
|
|
|
|
;# ymm15 and ymm14 hold two helper constants; shifting bit 32 left by 31
;# gives bit 63, which matches the two existing comments below.
vbroadcastsd ymm15, qword ptr [mul_hi_avx2_data+rip] ;# carry_bit (bit 32)
|
|
vpsllq ymm14, ymm15, 31 ;# sign64 (bit 63)
|
|
|
|
;# generated SuperscalarHash code goes here
|
|
|
|
;# Remaining labelled parts of the AVX2 dataset initialisation. Every body comes
;# from an include file that is not visible here, so only the layout is
;# documented: the four parts are emitted back to back in this order.
;# NOTE(review): judging by the names, the last two are load and prefetch
;# fragments for the generated SuperscalarHash code - confirm in the include files.
DECL(randomx_dataset_init_avx2_loop_end):
|
|
#include "asm/program_sshash_avx2_loop_end.inc"
|
|
|
|
DECL(randomx_dataset_init_avx2_epilogue):
|
|
#include "asm/program_sshash_avx2_epilogue.inc"
|
|
|
|
DECL(randomx_dataset_init_avx2_ssh_load):
|
|
#include "asm/program_sshash_avx2_ssh_load.inc"
|
|
|
|
DECL(randomx_dataset_init_avx2_ssh_prefetch):
|
|
#include "asm/program_sshash_avx2_ssh_prefetch.inc"
|
|
|
|
;# Program epilogue, placed on a 64-byte boundary. It consists of a common
;# store part followed by a platform specific part selected by WINABI, the
;# counterpart of the include selection in randomx_program_prologue.
;# All instructions come from include files that are not visible here.
.balign 64
|
|
DECL(randomx_program_epilogue):
|
|
#include "asm/program_epilogue_store.inc"
|
|
#if defined(WINABI)
|
|
#include "asm/program_epilogue_win64.inc"
|
|
#else
|
|
#include "asm/program_epilogue_linux.inc"
|
|
#endif
|
|
|
|
;# Load and prefetch fragments for the sshash code, placed on a 64-byte
;# boundary. Both bodies come from include files that are not visible here.
;# asm/program_sshash_prefetch.inc is included a second time inside
;# randomx_sshash_init further down.
.balign 64
|
|
DECL(randomx_sshash_load):
|
|
#include "asm/program_sshash_load.inc"
|
|
|
|
DECL(randomx_sshash_prefetch):
|
|
#include "asm/program_sshash_prefetch.inc"
|
|
|
|
;# marks the end of the two fragments above; a single nop follows the label
DECL(randomx_sshash_end):
|
|
nop
|
|
|
|
;# Sets up r8..r15 from a block index.
;# In:  rbx = block index (randomx_dataset_init copies rbp to rbx before its call)
;# Out: r8 = (rbx + 1) * r0_mul, and r9..r15 = matching rN_add constant XOR r8
;# r0_mul and r1_add..r7_add are not defined in the visible part of this
;# file; presumably they come from asm/program_sshash_constants.inc.
;# This is the scalar counterpart of the lane 0 setup in the AVX2 prologue.
.balign 64
|
|
DECL(randomx_sshash_init):
|
|
lea r8, [rbx+1]
|
|
;# prefetch fragment placed between the lea and the multiply; its registers
;# and operands are defined in the include file, not here
#include "asm/program_sshash_prefetch.inc"
|
|
imul r8, qword ptr [r0_mul+rip]
|
|
mov r9, qword ptr [r1_add+rip]
|
|
xor r9, r8
|
|
mov r10, qword ptr [r2_add+rip]
|
|
xor r10, r8
|
|
mov r11, qword ptr [r3_add+rip]
|
|
xor r11, r8
|
|
mov r12, qword ptr [r4_add+rip]
|
|
xor r12, r8
|
|
mov r13, qword ptr [r5_add+rip]
|
|
xor r13, r8
|
|
mov r14, qword ptr [r6_add+rip]
|
|
xor r14, r8
|
|
mov r15, qword ptr [r7_add+rip]
|
|
xor r15, r8
|
|
;# Jump over the constants that follow. In this file the target label is
;# followed by a nop and then randomx_reciprocal_fast.
;# NOTE(review): presumably this code runs from a copy in which generated
;# code sits at the jump target - confirm against the users of these symbols.
jmp DECL(randomx_program_end)
|
|
|
|
;# Contents of asm/program_sshash_constants.inc, placed on a 64-byte boundary
;# inside the text section. The jmp at the end of randomx_sshash_init skips
;# over this area.
.balign 64
|
|
#include "asm/program_sshash_constants.inc"
|
|
|
|
;# End marker on a 64-byte boundary, followed by a single nop. It is the target
;# of the jmp in randomx_sshash_init.
.balign 64
|
|
DECL(randomx_program_end):
|
|
nop
|
|
|
|
;# randomx_reciprocal_fast: wrapper around asm/randomx_reciprocal.inc.
;# Without WINABI the first integer argument arrives in rdi and is copied to
;# rcx, which is where the first argument already is under the Windows
;# convention. The included code therefore sees its input in rcx either way.
;# The computation, the return register and the ret are in the include file,
;# which is not visible here.
DECL(randomx_reciprocal_fast):
|
|
#if !defined(WINABI)
|
|
mov rcx, rdi
|
|
#endif
|
|
#include "asm/randomx_reciprocal.inc"
|
|
|
|
;# On Linux ELF targets, emit an empty .note.GNU-stack section with no flags.
;# This marks the object file as not requiring an executable stack.
#if defined(__linux__) && defined(__ELF__)
|
|
.section .note.GNU-stack,"",%progbits
|
|
#endif
|