ARM64 JIT: don't use x18 register

From https://developer.apple.com/documentation/xcode/writing-arm64-code-for-apple-platforms
> The platforms reserve register x18. Don’t use this register.

This PR fixes invalid hashes when running on Apple silicon with the latest macOS SDK.
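
For context: AAPCS64 designates x18 as the platform register, and Apple's ABI reserves it outright, so the OS may overwrite it at any moment (for example across a context switch). A JIT that keeps live data in x18 can see that data change under its feet mid-program, which is where the invalid hashes come from. The fix hands x18's role to x20 instead; x20 is callee-saved, so every entry point that now scratches it must also save and restore it. A minimal sketch of that pattern (hypothetical function, not part of this diff):

    scratch_example:
        str x20, [sp, -16]!    # x20 is callee-saved: preserve it (sp stays 16-byte aligned)
        eor x20, x0, x1        # use x20 where x18 used to be the temporary
        mov x0, x20
        ldr x20, [sp], 16      # restore the caller's x20
        ret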
Author: SChernykh
Date:   2023-10-19 17:39:25 +02:00
Parent: 08901a9a4b
Commit: 5e66efabcf

2 changed files with 75 additions and 77 deletions

@@ -72,9 +72,9 @@
 # x15 -> "r7"
 # x16 -> spAddr0
 # x17 -> spAddr1
-# x18 -> temporary
+# x18 -> unused (platform register, don't touch it)
 # x19 -> temporary
-# x20 -> literal for IMUL_RCP
+# x20 -> temporary
 # x21 -> literal for IMUL_RCP
 # x22 -> literal for IMUL_RCP
 # x23 -> literal for IMUL_RCP
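
Note the trade this makes: x20 stops caching an IMUL_RCP reciprocal, so only three registers (x21-x23) hold preloaded literals instead of four. For an IMUL_RCP that does get one of these registers, the hot-path code stays a single multiply; a sketch of the shape (registers illustrative):

    # dst = dst * reciprocal; x21 holds a 64-bit reciprocal preloaded at program start
    mul x4, x4, x21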
@@ -109,7 +109,7 @@ DECL(randomx_program_aarch64):
 # Save callee-saved registers
 sub sp, sp, 192
 stp x16, x17, [sp]
-stp x18, x19, [sp, 16]
+str x19, [sp, 16]
 stp x20, x21, [sp, 32]
 stp x22, x23, [sp, 48]
 stp x24, x25, [sp, 64]
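
With x18 gone from the frame, the pair store becomes a single store: only x19 lives at [sp, 16] now, and the 8 bytes at [sp, 24] are left as padding so every later slot keeps its old offset. The layout in isolation:

    stp x16, x17, [sp]       # [sp+0]  = x16, [sp+8]  = x17
    str x19, [sp, 16]        # [sp+16] = x19, [sp+24] unused padding
    stp x20, x21, [sp, 32]   # offsets from here on are unchanged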
@@ -164,7 +164,6 @@ DECL(randomx_program_aarch64):
 # Read literals
 ldr x0, literal_x0
 ldr x11, literal_x11
-ldr x20, literal_x20
 ldr x21, literal_x21
 ldr x22, literal_x22
 ldr x23, literal_x23
@@ -196,11 +195,11 @@
 DECL(randomx_program_aarch64_main_loop):
 # spAddr0 = spMix1 & ScratchpadL3Mask64;
 # spAddr1 = (spMix1 >> 32) & ScratchpadL3Mask64;
-lsr x18, x10, 32
+lsr x20, x10, 32
 # Actual mask will be inserted by JIT compiler
 and w16, w10, 1
-and w17, w18, 1
+and w17, w20, 1
 # x16 = scratchpad + spAddr0
 # x17 = scratchpad + spAddr1
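
The `and w16, w10, 1` and `and w17, w20, 1` above are placeholders: the JIT compiler overwrites their immediates with the real ScratchpadL3Mask64 before the code runs. Filled in, the address split looks like this (0x1FFFC0 assumes RandomX's default 2 MiB scratchpad; treat the constant as illustrative):

    lsr x20, x10, 32         # x20 = spMix1 >> 32
    and w16, w10, 0x1FFFC0   # spAddr0 = (uint32_t)spMix1 & ScratchpadL3Mask64
    and w17, w20, 0x1FFFC0   # spAddr1 = (uint32_t)(spMix1 >> 32) & ScratchpadL3Mask64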
@@ -208,31 +207,31 @@ DECL(randomx_program_aarch64_main_loop):
 add x17, x17, x2
 # xor integer registers with scratchpad data (spAddr0)
-ldp x18, x19, [x16]
-eor x4, x4, x18
+ldp x20, x19, [x16]
+eor x4, x4, x20
 eor x5, x5, x19
-ldp x18, x19, [x16, 16]
-eor x6, x6, x18
+ldp x20, x19, [x16, 16]
+eor x6, x6, x20
 eor x7, x7, x19
-ldp x18, x19, [x16, 32]
-eor x12, x12, x18
+ldp x20, x19, [x16, 32]
+eor x12, x12, x20
 eor x13, x13, x19
-ldp x18, x19, [x16, 48]
-eor x14, x14, x18
+ldp x20, x19, [x16, 48]
+eor x14, x14, x20
 eor x15, x15, x19
 # Load group F registers (spAddr1)
-ldpsw x18, x19, [x17]
-ins v16.d[0], x18
+ldpsw x20, x19, [x17]
+ins v16.d[0], x20
 ins v16.d[1], x19
-ldpsw x18, x19, [x17, 8]
-ins v17.d[0], x18
+ldpsw x20, x19, [x17, 8]
+ins v17.d[0], x20
 ins v17.d[1], x19
-ldpsw x18, x19, [x17, 16]
-ins v18.d[0], x18
+ldpsw x20, x19, [x17, 16]
+ins v18.d[0], x20
 ins v18.d[1], x19
-ldpsw x18, x19, [x17, 24]
-ins v19.d[0], x18
+ldpsw x20, x19, [x17, 24]
+ins v19.d[0], x20
 ins v19.d[1], x19
 scvtf v16.2d, v16.2d
 scvtf v17.2d, v17.2d
@@ -240,17 +239,17 @@ DECL(randomx_program_aarch64_main_loop):
 scvtf v19.2d, v19.2d
 # Load group E registers (spAddr1)
-ldpsw x18, x19, [x17, 32]
-ins v20.d[0], x18
+ldpsw x20, x19, [x17, 32]
+ins v20.d[0], x20
 ins v20.d[1], x19
-ldpsw x18, x19, [x17, 40]
-ins v21.d[0], x18
+ldpsw x20, x19, [x17, 40]
+ins v21.d[0], x20
 ins v21.d[1], x19
-ldpsw x18, x19, [x17, 48]
-ins v22.d[0], x18
+ldpsw x20, x19, [x17, 48]
+ins v22.d[0], x20
 ins v22.d[1], x19
-ldpsw x18, x19, [x17, 56]
-ins v23.d[0], x18
+ldpsw x20, x19, [x17, 56]
+ins v23.d[0], x20
 ins v23.d[1], x19
 scvtf v20.2d, v20.2d
 scvtf v21.2d, v21.2d
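
All of the group F and group E loads follow the same three-step pattern: `ldpsw` loads two adjacent 32-bit words and sign-extends each to 64 bits, `ins` moves them into the two lanes of a NEON register, and `scvtf` converts both lanes to double. One iteration, stripped of its neighbors:

    ldpsw x20, x19, [x17]    # x20 = (int64)word[0], x19 = (int64)word[1]
    ins v16.d[0], x20        # lane 0 <- x20
    ins v16.d[1], x19        # lane 1 <- x19
    scvtf v16.2d, v16.2d     # two int64 lanes -> two float64 lanes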
@@ -273,7 +272,6 @@ DECL(randomx_program_aarch64_vm_instructions):
 literal_x0: .fill 1,8,0
 literal_x11: .fill 1,8,0
-literal_x20: .fill 1,8,0
 literal_x21: .fill 1,8,0
 literal_x22: .fill 1,8,0
 literal_x23: .fill 1,8,0
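
The deleted `literal_x20` slot pairs with the `ldr x20, literal_x20` removed earlier. The idiom: each `literal_*` label reserves 8 zeroed bytes that the JIT patches with a runtime value (for x21-x23, the IMUL_RCP reciprocals named in the register map), and the matching `ldr` is a PC-relative literal load. In isolation:

    ldr x21, literal_x21        # PC-relative load of the 8-byte slot below
    ...
    literal_x21: .fill 1,8,0    # one 8-byte item, zero-initialized; patched at JIT time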
@@ -309,17 +307,17 @@ DECL(randomx_program_aarch64_vm_instructions_end):
 lsr x10, x9, 32
 # mx ^= r[readReg2] ^ r[readReg3];
-eor x9, x9, x18
+eor x9, x9, x20
 # Calculate dataset pointer for dataset prefetch
-mov w18, w9
+mov w20, w9
 DECL(randomx_program_aarch64_cacheline_align_mask1):
 # Actual mask will be inserted by JIT compiler
-and x18, x18, 1
-add x18, x18, x1
+and x20, x20, 1
+add x20, x20, x1
 # Prefetch dataset data
-prfm pldl2strm, [x18]
+prfm pldl2strm, [x20]
 # mx <-> ma
 ror x9, x9, 32
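
Two details in this block: `prfm pldl2strm` is a hint to prefetch the dataset line for a load into L2 with streaming (use-once) locality, and mx and ma share the two 32-bit halves of x9, so swapping them is a single rotate:

    ror x9, x9, 32           # swap high and low words: mx <-> ma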
@@ -331,17 +329,17 @@ DECL(randomx_program_aarch64_cacheline_align_mask2):
 DECL(randomx_program_aarch64_xor_with_dataset_line):
 # xor integer registers with dataset data
-ldp x18, x19, [x10]
-eor x4, x4, x18
+ldp x20, x19, [x10]
+eor x4, x4, x20
 eor x5, x5, x19
-ldp x18, x19, [x10, 16]
-eor x6, x6, x18
+ldp x20, x19, [x10, 16]
+eor x6, x6, x20
 eor x7, x7, x19
-ldp x18, x19, [x10, 32]
-eor x12, x12, x18
+ldp x20, x19, [x10, 32]
+eor x12, x12, x20
 eor x13, x13, x19
-ldp x18, x19, [x10, 48]
-eor x14, x14, x18
+ldp x20, x19, [x10, 48]
+eor x14, x14, x20
 eor x15, x15, x19
 DECL(randomx_program_aarch64_update_spMix1):
@@ -384,7 +382,7 @@ DECL(randomx_program_aarch64_update_spMix1):
 # Restore callee-saved registers
 ldp x16, x17, [sp]
-ldp x18, x19, [sp, 16]
+ldr x19, [sp, 16]
 ldp x20, x21, [sp, 32]
 ldp x22, x23, [sp, 48]
 ldp x24, x25, [sp, 64]
@@ -405,7 +403,7 @@ DECL(randomx_program_aarch64_vm_instructions_end_light):
 stp x2, x30, [sp, 80]
 # mx ^= r[readReg2] ^ r[readReg3];
-eor x9, x9, x18
+eor x9, x9, x20
 # mx <-> ma
 ror x9, x9, 32
@@ -447,8 +445,8 @@ DECL(randomx_program_aarch64_light_dataset_offset):
 # x3 -> end item
 DECL(randomx_init_dataset_aarch64):
-# Save x30 (return address)
-str x30, [sp, -16]!
+# Save x20 (used as temporary, but must be saved to not break ABI) and x30 (return address)
+stp x20, x30, [sp, -16]!
 # Load pointer to cache memory
 ldr x0, [x0]
@@ -460,8 +458,8 @@ DECL(randomx_init_dataset_aarch64_main_loop):
 cmp x2, x3
 bne DECL(randomx_init_dataset_aarch64_main_loop)
-# Restore x30 (return address)
-ldr x30, [sp], 16
+# Restore x20 and x30
+ldp x20, x30, [sp], 16
 ret
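
The save/restore added to randomx_init_dataset_aarch64 is the standard AArch64 push/pop idiom: pre-indexed `stp` allocates 16 bytes and stores both registers in one instruction, and post-indexed `ldp` restores them and frees the stack on the way out:

    stp x20, x30, [sp, -16]!   # sp -= 16, then [sp] = x20, [sp+8] = x30
    ...                        # body clobbers x20
    ldp x20, x30, [sp], 16     # x20 = [sp], x30 = [sp+8], then sp += 16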