ARM64 JIT: don't use x18 register

From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
> The platforms reserve register x18. Don’t use this register.

This PR fixes invalid hashes when running on Apple silicon with the latest macOS SDK.
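
For context: AAPCS64 designates x18 as the platform register, and Apple's ABI reserves it outright, so the OS may overwrite it at any moment (for example across a context switch). A JIT that keeps live data in x18 can see that data change under its feet mid-program, which is where the invalid hashes come from. The fix hands x18's role to x20 instead; x20 is callee-saved, so every entry point that now scratches it must also save and restore it. A minimal sketch of that pattern (hypothetical function, not part of this diff):

    scratch_example:
        str x20, [sp, -16]!    # x20 is callee-saved: preserve it (sp stays 16-byte aligned)
        eor x20, x0, x1        # use x20 where x18 used to be the temporary
        mov x0, x20
        ldr x20, [sp], 16      # restore the caller's x20
        ret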
Author: SChernykh
Date:   2023-10-19 17:39:25 +02:00
Parent: 08901a9a4b
Commit: 5e66efabcf

2 changed files with 75 additions and 77 deletions

@@ -72,9 +72,9 @@
 # x15 -> "r7"
 # x16 -> spAddr0
 # x17 -> spAddr1
-# x18 -> temporary
+# x18 -> unused (platform register, don't touch it)
 # x19 -> temporary
-# x20 -> literal for IMUL_RCP
+# x20 -> temporary
 # x21 -> literal for IMUL_RCP
 # x22 -> literal for IMUL_RCP
 # x23 -> literal for IMUL_RCP
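
Note the trade this makes: x20 stops caching an IMUL_RCP reciprocal, so only three registers (x21-x23) hold preloaded literals instead of four. For an IMUL_RCP that does get one of these registers, the hot-path code stays a single multiply; a sketch of the shape (registers illustrative):

    # dst = dst * reciprocal; x21 holds a 64-bit reciprocal preloaded at program start
    mul x4, x4, x21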
@@ -109,7 +109,7 @@ DECL(randomx_program_aarch64):
 # Save callee-saved registers
 sub sp, sp, 192
 stp x16, x17, [sp]
-stp x18, x19, [sp, 16]
+str x19, [sp, 16]
 stp x20, x21, [sp, 32]
 stp x22, x23, [sp, 48]
 stp x24, x25, [sp, 64]
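
With x18 gone from the frame, the pair store becomes a single store: only x19 lives at [sp, 16] now, and the 8 bytes at [sp, 24] are left as padding so every later slot keeps its old offset. The layout in isolation:

    stp x16, x17, [sp]       # [sp+0]  = x16, [sp+8]  = x17
    str x19, [sp, 16]        # [sp+16] = x19, [sp+24] unused padding
    stp x20, x21, [sp, 32]   # offsets from here on are unchanged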
@@ -164,7 +164,6 @@ DECL(randomx_program_aarch64):
 # Read literals
 ldr x0, literal_x0
 ldr x11, literal_x11
-ldr x20, literal_x20
 ldr x21, literal_x21
 ldr x22, literal_x22
 ldr x23, literal_x23
@@ -196,11 +195,11 @@
 DECL(randomx_program_aarch64_main_loop):
 # spAddr0 = spMix1 & ScratchpadL3Mask64;
 # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
-lsr x18, x10, 32
+lsr x20, x10, 32
 # Actual mask will be inserted by JIT compiler
 and w16, w10, 1
-and w17, w18, 1
+and w17, w20, 1
 # x16 = scratchpad + spAddr0
 # x17 = scratchpad + spAddr1
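
The `and w16, w10, 1` and `and w17, w20, 1` above are placeholders: the JIT compiler overwrites their immediates with the real ScratchpadL3Mask64 before the code runs. Filled in, the address split looks like this (0x1FFFC0 assumes RandomX's default 2 MiB scratchpad; treat the constant as illustrative):

    lsr x20, x10, 32         # x20 = spMix1 >> 32
    and w16, w10, 0x1FFFC0   # spAddr0 = (uint32_t)spMix1 & ScratchpadL3Mask64
    and w17, w20, 0x1FFFC0   # spAddr1 = (uint32_t)(spMix1 >> 32) & ScratchpadL3Mask64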
@@ -208,31 +207,31 @@ DECL(randomx_program_aarch64_main_loop):
 add x17, x17, x2
 # xor integer registers with scratchpad data (spAddr0)
-ldp x18, x19, [x16]
-eor x4, x4, x18
+ldp x20, x19, [x16]
+eor x4, x4, x20
 eor x5, x5, x19
-ldp x18, x19, [x16, 16]
-eor x6, x6, x18
+ldp x20, x19, [x16, 16]
+eor x6, x6, x20
 eor x7, x7, x19
-ldp x18, x19, [x16, 32]
-eor x12, x12, x18
+ldp x20, x19, [x16, 32]
+eor x12, x12, x20
 eor x13, x13, x19
-ldp x18, x19, [x16, 48]
-eor x14, x14, x18
+ldp x20, x19, [x16, 48]
+eor x14, x14, x20
 eor x15, x15, x19
 # Load group F registers (spAddr1)
-ldpsw x18, x19, [x17]
-ins v16.d[0], x18
+ldpsw x20, x19, [x17]
+ins v16.d[0], x20
 ins v16.d[1], x19
-ldpsw x18, x19, [x17, 8]
-ins v17.d[0], x18
+ldpsw x20, x19, [x17, 8]
+ins v17.d[0], x20
 ins v17.d[1], x19
-ldpsw x18, x19, [x17, 16]
-ins v18.d[0], x18
+ldpsw x20, x19, [x17, 16]
+ins v18.d[0], x20
 ins v18.d[1], x19
-ldpsw x18, x19, [x17, 24]
-ins v19.d[0], x18
+ldpsw x20, x19, [x17, 24]
+ins v19.d[0], x20
 ins v19.d[1], x19
 scvtf v16.2d, v16.2d
 scvtf v17.2d, v17.2d
@@ -240,17 +239,17 @@ DECL(randomx_program_aarch64_main_loop):
 scvtf v19.2d, v19.2d
 # Load group E registers (spAddr1)
-ldpsw x18, x19, [x17, 32]
-ins v20.d[0], x18
+ldpsw x20, x19, [x17, 32]
+ins v20.d[0], x20
 ins v20.d[1], x19
-ldpsw x18, x19, [x17, 40]
-ins v21.d[0], x18
+ldpsw x20, x19, [x17, 40]
+ins v21.d[0], x20
 ins v21.d[1], x19
-ldpsw x18, x19, [x17, 48]
-ins v22.d[0], x18
+ldpsw x20, x19, [x17, 48]
+ins v22.d[0], x20
 ins v22.d[1], x19
-ldpsw x18, x19, [x17, 56]
-ins v23.d[0], x18
+ldpsw x20, x19, [x17, 56]
+ins v23.d[0], x20
 ins v23.d[1], x19
 scvtf v20.2d, v20.2d
 scvtf v21.2d, v21.2d
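
All of the group F and group E loads follow the same three-step pattern: `ldpsw` loads two adjacent 32-bit words and sign-extends each to 64 bits, `ins` moves them into the two lanes of a NEON register, and `scvtf` converts both lanes to double. One iteration, stripped of its neighbors:

    ldpsw x20, x19, [x17]    # x20 = (int64)word[0], x19 = (int64)word[1]
    ins v16.d[0], x20        # lane 0 <- x20
    ins v16.d[1], x19        # lane 1 <- x19
    scvtf v16.2d, v16.2d     # two int64 lanes -> two float64 lanes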
@@ -273,7 +272,6 @@ DECL(randomx_program_aarch64_vm_instructions):
 literal_x0: .fill 1,8,0
 literal_x11: .fill 1,8,0
-literal_x20: .fill 1,8,0
 literal_x21: .fill 1,8,0
 literal_x22: .fill 1,8,0
 literal_x23: .fill 1,8,0
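
The deleted `literal_x20` slot pairs with the `ldr x20, literal_x20` removed earlier. The idiom: each `literal_*` label reserves 8 zeroed bytes that the JIT patches with a runtime value (for x21-x23, the IMUL_RCP reciprocals named in the register map), and the matching `ldr` is a PC-relative literal load. In isolation:

    ldr x21, literal_x21        # PC-relative load of the 8-byte slot below
    ...
    literal_x21: .fill 1,8,0    # one 8-byte item, zero-initialized; patched at JIT time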
@@ -309,17 +307,17 @@ DECL(randomx_program_aarch64_vm_instructions_end):
 lsr x10, x9, 32
 # mx ^= r[readReg2] ^ r[readReg3];
-eor x9, x9, x18
+eor x9, x9, x20
 # Calculate dataset pointer for dataset prefetch
-mov w18, w9
+mov w20, w9
 DECL(randomx_program_aarch64_cacheline_align_mask1):
 # Actual mask will be inserted by JIT compiler
-and x18, x18, 1
-add x18, x18, x1
+and x20, x20, 1
+add x20, x20, x1
 # Prefetch dataset data
-prfm pldl2strm, [x18]
+prfm pldl2strm, [x20]
 # mx <-> ma
 ror x9, x9, 32
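
Two details in this block: `prfm pldl2strm` is a hint to prefetch the dataset line for a load into L2 with streaming (use-once) locality, and mx and ma share the two 32-bit halves of x9, so swapping them is a single rotate:

    ror x9, x9, 32           # swap high and low words: mx <-> ma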
@@ -331,17 +329,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
 DECL(randomx_program_aarch64_xor_with_dataset_line):
 # xor integer registers with dataset data
-ldp x18, x19, [x10]
-eor x4, x4, x18
+ldp x20, x19, [x10]
+eor x4, x4, x20
 eor x5, x5, x19
-ldp x18, x19, [x10, 16]
-eor x6, x6, x18
+ldp x20, x19, [x10, 16]
+eor x6, x6, x20
 eor x7, x7, x19
-ldp x18, x19, [x10, 32]
-eor x12, x12, x18
+ldp x20, x19, [x10, 32]
+eor x12, x12, x20
 eor x13, x13, x19
-ldp x18, x19, [x10, 48]
-eor x14, x14, x18
+ldp x20, x19, [x10, 48]
+eor x14, x14, x20
 eor x15, x15, x19
 DECL(randomx_program_aarch64_update_spMix1):
@@ -384,7 +382,7 @@ DECL(randomx_program_aarch64_update_spMix1):
 # Restore callee-saved registers
 ldp x16, x17, [sp]
-ldp x18, x19, [sp, 16]
+ldr x19, [sp, 16]
 ldp x20, x21, [sp, 32]
 ldp x22, x23, [sp, 48]
 ldp x24, x25, [sp, 64]
@@ -405,7 +403,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
 stp x2, x30, [sp, 80]
 # mx ^= r[readReg2] ^ r[readReg3];
-eor x9, x9, x18
+eor x9, x9, x20
 # mx <-> ma
 ror x9, x9, 32
@@ -447,8 +445,8 @@ DECL(randomx_program_aarch64_light_dataset_offset):
 # x3 -> end item
 DECL(randomx_init_dataset_aarch64):
-# Save x30 (return address)
-str x30, [sp, -16]!
+# Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address)
+stp x20, x30, [sp, -16]!
 # Load pointer to cache memory
 ldr x0, [x0]
@@ -460,8 +458,8 @@ DECL(randomx_init_dataset_aarch64_main_loop):
 cmp x2, x3
 bne DECL(randomx_init_dataset_aarch64_main_loop)
-# Restore x30 (return address)
-ldr x30, [sp], 16
+# Restore x20 and x30
+ldp x20, x30, [sp], 16
 ret
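
The save/restore added to randomx_init_dataset_aarch64 is the standard AArch64 push/pop idiom: pre-indexed `stp` allocates 16 bytes and stores both registers in one instruction, and post-indexed `ldp` restores them and frees the stack on the way out:

    stp x20, x30, [sp, -16]!   # sp -= 16, then [sp] = x20, [sp+8] = x30
    ...                        # body clobbers x20
    ldp x20, x30, [sp], 16     # x20 = [sp], x30 = [sp+8], then sp += 16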