REDACTED-rig/src/backend/opencl/cl/rx/randomx_run_gfx803.asm

/*
Copyright (c) 2019 SChernykh

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/

# Binary format / target selection and kernel metadata for the randomx_run kernel.
.amdcl2
.gpu GFX803
.64bit
.arch_minor 0
.arch_stepping 0
.driver_version 203603
.kernel randomx_run
	.config
		# One-dimensional launch, 64 work-items per work-group
		# (the code below computes indices with shifts by 6 and masks with 63).
		.dims x
		.cws 64, 1, 1
		.sgprsnum 96
		# 6 waves per SIMD: 37-40 VGPRs
		# 5 waves per SIMD: 41-48 VGPRs
		# 4 waves per SIMD: 49-64 VGPRs
		# 3 waves per SIMD: 65-84 VGPRs
		# 2 waves per SIMD: 85-128 VGPRs
		# 1 wave  per SIMD: 129-256 VGPRs
		# 128 VGPRs -> the "2 waves per SIMD" row of the table above
		.vgprsnum 128
		# 256 bytes of LDS: the kernel copies 64 dwords (one per work-item) into it at startup
		.localsize 256
		.floatmode 0xc0
		.pgmrsrc1 0x00ac035f
		# 0x8c: bits 1:5 = 6, i.e. six user-data SGPRs (s0-s5); the code reads the group id from s6
		.pgmrsrc2 0x0000008c
		.dx10clamp
		.ieeemode
		.useargs
		.priority 0
		# Kernel arguments. The code reads them through the pointer in s[4:5] at these byte offsets:
		#   0x00 _.global_offset_0, 0x30 dataset, 0x38 scratchpad, 0x40 registers,
		#   0x48 rounding_modes, 0x50 programs, 0x58 batch_size, 0x5c rx_parameters
		# (i.e. every argument before batch_size occupies 8 bytes).
		.arg _.global_offset_0, "size_t", long
		.arg _.global_offset_1, "size_t", long
		.arg _.global_offset_2, "size_t", long
		.arg _.printf_buffer, "size_t", void*, global, , rdonly
		.arg _.vqueue_pointer, "size_t", long
		.arg _.aqlwrap_pointer, "size_t", long
		.arg dataset, "uchar*", uchar*, global, const, rdonly
		.arg scratchpad, "uchar*", uchar*, global,
		.arg registers, "ulong*", ulong*, global,
		.arg rounding_modes, "uint*", uint*, global,
		.arg programs, "uint*", uint*, global,
		.arg batch_size, "uint", uint
		.arg rx_parameters, "uint", uint
	# Kernel entry point: one-time cache maintenance, then jump to the real start.
	.text
		# m0 = 0x10000 (presumably the LDS access limit used by the ds_* instructions below - confirm)
		s_mov_b32       m0, 0x10000
		# Write back the scalar data cache and invalidate the instruction cache.
		# NOTE(review): presumably required because the program called via s_swappc_b64
		# in main_loop is generated code written to memory by an earlier kernel - confirm.
		s_dcache_wb
		s_waitcnt       vmcnt(0) & lgkmcnt(0)
		s_icache_inv
		s_branch begin

		# pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so first 6 SGPRs (s0-s5) contain user data
		# s6 contains group id (copied to s8 below; the rest of the code reads it from s8)
		# v0 contains local id
		#
		# This part computes the work-item index, loads the per-group rounding mode,
		# copies 256 bytes of this group's "registers" data into LDS (one dword per lane)
		# and then keeps only lanes 0-7 active.
begin:
		s_mov_b32 s8, s6
		# v1 = group id * 64 + local id
		v_lshlrev_b32   v1, 6, s8
		v_add_u32       v1, vcc, v1, v0
		# s[0:1] = argument at 0x0 (global_offset_0), s[2:3] = argument at 0x40 (registers),
		# s[64:65] = argument at 0x48 (rounding_modes)
		s_load_dwordx2  s[0:1], s[4:5], 0x0
		s_load_dwordx2  s[2:3], s[4:5], 0x40
		s_load_dwordx2  s[64:65], s[4:5], 0x48
		s_waitcnt       lgkmcnt(0)

		# load rounding mode
		# s[64:65] = &rounding_modes[group id]; kept for the store at the end of the kernel
		s_lshl_b32      s16, s8, 2
		s_add_u32       s64, s64, s16
		s_addc_u32      s65, s65, 0
		v_mov_b32       v8, s64
		v_mov_b32       v9, s65
		flat_load_dword v8, v[8:9]
		s_waitcnt       vmcnt(0)
		# s66 = loaded rounding mode, written to the 2-bit field at offset 2 of the MODE register
		v_readlane_b32  s66, v8, 0
		s_setreg_b32    hwreg(mode, 2, 2), s66
		# s67 = 0: the value the FSQRT_R/FDIV_M subroutines temporarily write to the same field
		s_mov_b32       s67, 0

		# used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
		s_mov_b32       s68, 256
		s_mov_b32       s69, 0

		# v1 += global offset; v2 = v1 / 64 (global group index); v1 = v1 % 64 (lane within the group)
		v_add_u32       v1, vcc, s0, v1
		v_lshrrev_b32   v2, 6, v1
		v_lshlrev_b32   v3, 5, v2
		v_and_b32       v1, 63, v1
		v_mov_b32       v4, 0
		# v[3:4] = v2 * 32 * 8 = byte offset of this group's 256-byte block in "registers"
		v_lshlrev_b64   v[3:4], 3, v[3:4]
		# v5 = lane * 16
		v_lshlrev_b32   v5, 4, v1
		# v[3:4] = registers + v2 * 256
		v_add_u32       v3, vcc, s2, v3
		v_mov_b32       v6, s3
		v_addc_u32      v4, vcc, v6, v4, vcc
		# v41 = lane * 4; every lane loads one dword of the block and stores it at the same offset in LDS
		v_lshlrev_b32   v41, 2, v1
		v_add_u32       v6, vcc, v3, v41
		v_addc_u32      v7, vcc, v4, 0, vcc
		flat_load_dword v6, v[6:7]
		# v0 = 0 is used as the LDS base address from here on
		v_mov_b32       v0, 0
		s_waitcnt       vmcnt(0)
		ds_write_b32    v41, v6
		s_waitcnt       lgkmcnt(0)
		# Keep only lanes with v1 <= 7 active; leave the kernel if there are none
		s_mov_b64       s[0:1], exec
		v_cmpx_le_u32   s[2:3], v1, 7
		s_cbranch_execz program_end

		# Decode rx_parameters, read the per-hash configuration words from LDS,
		# compute base addresses and set up the constants used by main_loop and the subroutines.

		# rx_parameters
		s_load_dword    s20, s[4:5], 0x5c
		s_waitcnt       lgkmcnt(0)

		# Scratchpad L1 size
		# bits 0-4 of rx_parameters hold log2 of the size; s21 = 1 << that value
		s_bfe_u32       s21, s20, 0x050000
		s_lshl_b32      s21, 1, s21

		# Scratchpad L2 size
		# bits 5-9, s22 = 1 << value
		s_bfe_u32       s22, s20, 0x050005
		s_lshl_b32      s22, 1, s22

		# Scratchpad L3 size
		# bits 10-14, s23 = 1 << value
		s_bfe_u32       s23, s20, 0x05000A
		s_lshl_b32      s23, 1, s23

		# program iterations
		# bits 15-18, s24 = 1 << value
		s_bfe_u32       s24, s20, 0x04000F
		s_lshl_b32      s24, 1, s24

		# Base address for scratchpads
		# v20:v2 = 64-bit product v2 * (L3 size + 64), where v2 is the global group index
		s_add_u32       s2, s23, 64
		v_mul_hi_u32    v20, v2, s2
		v_mul_lo_u32    v2, v2, s2

		# v41, v44 = 0
		v_mov_b32       v41, 0
		v_mov_b32       v44, 0

		# Read configuration words from LDS (v0 = 0, so offsets are absolute byte offsets):
		#   v6 = dword at 152, v[34:35] = qword at 144, v[36:37] = qword at 128, v[11:12] = qword at 136
		ds_read_b32     v6, v0 offset:152
		# s[2:3] = mask of lanes with v1 < 4
		v_cmp_lt_u32    s[2:3], v1, 4
		ds_read2_b64    v[34:37], v0 offset0:18 offset1:16
		ds_read_b64     v[11:12], v0 offset:136
		s_movk_i32      s9, 0x0
		# Lanes 4-7: v[13:14] = qword at LDS offset 160; lanes 0-3: v[13:14] = 0
		s_mov_b64       s[6:7], exec
		s_andn2_b64     exec, s[6:7], s[2:3]
		ds_read_b64     v[13:14], v0 offset:160
		s_andn2_b64     exec, s[6:7], exec
		v_mov_b32       v13, 0
		v_mov_b32       v14, 0
		s_mov_b64       exec, s[6:7]

		# compiled program size
		# s[6:7] = group id (s8, with s9 = 0) * 10048 = byte offset of this group's program
		s_mov_b64       s[6:7], s[8:9]
		s_mulk_i32      s6, 10048

		# v5 = 64 + lane * 16 (LDS address written by ds_write2_b64 in main_loop)
		v_add_u32       v5, vcc, v0, v5
		v_add_u32       v5, vcc, v5, 64
		# Lanes 4-7: v[15:16] = qword at LDS offset 168; lanes 0-3: v[15:16] = 0
		s_mov_b64       s[8:9], exec
		s_andn2_b64     exec, s[8:9], s[2:3]
		ds_read_b64     v[15:16], v0 offset:168
		s_andn2_b64     exec, s[8:9], exec
		v_mov_b32       v15, 0
		v_mov_b32       v16, 0
		s_mov_b64       exec, s[8:9]
		# s[8:9] = argument at 0x30 (dataset), s[10:11] = argument at 0x38 (scratchpad)
		s_load_dwordx4  s[8:11], s[4:5], 0x30

		# batch_size
		s_load_dword    s16, s[4:5], 0x58

		# s[4:5] = argument at 0x50 (programs); this overwrites the argument pointer
		s_load_dwordx2  s[4:5], s[4:5], 0x50
		# v1 = lane * 8 ("sub * 8" in main_loop), v17 = LDS address of the 8-byte slot of this lane
		v_lshlrev_b32   v1, 3, v1
		v_add_u32       v17, vcc, v0, v1
		s_waitcnt       lgkmcnt(0)
		# v2 (low) / v18 (high) = scratchpad + group index * (L3 size + 64)
		v_add_u32       v2, vcc, s10, v2
		v_mov_b32       v18, s11
		v_addc_u32      v18, vcc, v18, v20, vcc
		v_mov_b32       v19, 0xffffff
		# v6 (low) / v20 (high) = dataset + dword read from LDS offset 152
		v_add_u32       v6, vcc, s8, v6
		v_mov_b32       v20, s9
		v_addc_u32      v20, vcc, v20, 0, vcc
		# v[21:22] = 8-byte LDS slot of this lane (R[sub] in main_loop)
		ds_read_b64     v[21:22], v17
		# s[4:5] = programs + group id * 10048: entry address called from main_loop
		s_add_u32       s4, s4, s6
		s_addc_u32      s5, s5, s7
		# v19 = 0xffffffff on lanes 0-3, 0x00ffffff on lanes 4-7 (mask for the high dword in main_loop)
		v_cndmask_b32   v19, v19, -1, s[2:3]
		# Turn the four indices read from LDS offsets 136..151 into LDS byte addresses (index * 8):
		#   v0 <- v11, v12 <- v12, v7 <- v34, v8 <- v35
		v_lshlrev_b32   v8, 3, v35
		v_lshlrev_b32   v7, 3, v34
		v_lshlrev_b32   v12, 3, v12
		v_lshlrev_b32   v10, 3, v11
		v_add_u32       v8, vcc, v8, v0
		v_add_u32       v7, vcc, v7, v0
		v_add_u32       v12, vcc, v12, v0
		# v0 stops being the zero LDS base here and becomes the address read first in main_loop
		v_add_u32       v0, vcc, v10, v0
		# Initial spAddr1 (v10) and spAddr0 (v23) come from the qword read at LDS offset 128
		v_mov_b32       v10, v36
		v_mov_b32       v23, v37

		# loop counter
		s_sub_u32       s2, s24, 1

		# batch_size
		s_mov_b32       s3, s16

		# Scratchpad masks for scratchpads
		# v38 = L1 size - 8, v39 = L2 size - 8, v50 = L3 size - 8
		v_sub_u32       v38, vcc, s21, 8
		v_sub_u32       v39, vcc, s22, 8
		v_sub_u32       v50, vcc, s23, 8

		# mask for FSCAL_R
		v_mov_b32       v51, 0x80F00000

		# swap v3 and v18
		# afterwards v[2:3] = scratchpad base address and v18 (low) / v4 (high) = registers block address
		v_mov_b32       v52, v3
		v_mov_b32       v3, v18
		v_mov_b32       v18, v52

		# load scratchpad base address
		v_readlane_b32	s0, v2, 0
		v_readlane_b32	s1, v3, 0

		# save current execution mask
		s_mov_b64       s[36:37], exec

		# v41 = 0 on lane 0, set it to 8 on lane 1
		# v44 = 0 on lane 0, set it to 4 on lane 1
		s_mov_b64       exec, 2
		v_mov_b32       v41, 8
		v_mov_b32       v44, 4

		# load group A registers
		# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
		# (LDS offsets 192, 208, 224, 240 plus v41)
		s_mov_b64       exec, 3
		ds_read2_b64    v[52:55], v41 offset0:24 offset1:26
		ds_read2_b64    v[56:59], v41 offset0:28 offset1:30

		# xmantissaMask
		v_mov_b32       v77, (1 << 24) - 1

		# xexponentMask
		ds_read_b64     v[78:79], v41 offset:160

		# Restore execution mask
		s_mov_b64       exec, s[36:37]

		# sign mask (used in FSQRT_R)
		v_mov_b32       v82, 0x80000000

		# High 32 bits of "1.0" constant (used in FDIV_M)
		v_mov_b32       v83, (1023 << 20)

		# Used to multiply FP64 values by 0.5
		v_mov_b32       v84, (1 << 20)

		# Build a table of absolute subroutine addresses in s[40:59], relative to the
		# program counter captured here, and set up scalar constants.
		# NOTE(review): these SGPRs are not read anywhere in the visible code; presumably the
		# program called through s_swappc_b64 in main_loop uses them - confirm.
		# s[14:15] = address of cur_addr (the instruction following s_getpc_b64)
		s_getpc_b64 s[14:15]
cur_addr:

		# get addresses of FSQRT_R subroutines
		s_add_u32       s40, s14, fsqrt_r_sub0 - cur_addr
		s_addc_u32      s41, s15, 0
		s_add_u32       s42, s14, fsqrt_r_sub1 - cur_addr
		s_addc_u32      s43, s15, 0
		s_add_u32       s44, s14, fsqrt_r_sub2 - cur_addr
		s_addc_u32      s45, s15, 0
		s_add_u32       s46, s14, fsqrt_r_sub3 - cur_addr
		s_addc_u32      s47, s15, 0

		# get addresses of FDIV_M subroutines
		s_add_u32       s48, s14, fdiv_m_sub0 - cur_addr
		s_addc_u32      s49, s15, 0
		s_add_u32       s50, s14, fdiv_m_sub1 - cur_addr
		s_addc_u32      s51, s15, 0
		s_add_u32       s52, s14, fdiv_m_sub2 - cur_addr
		s_addc_u32      s53, s15, 0
		s_add_u32       s54, s14, fdiv_m_sub3 - cur_addr
		s_addc_u32      s55, s15, 0

		# get address for ISMULH_R subroutine
		s_add_u32       s56, s14, ismulh_r_sub - cur_addr
		s_addc_u32      s57, s15, 0

		# get address for IMULH_R subroutine
		s_add_u32       s58, s14, imulh_r_sub - cur_addr
		s_addc_u32      s59, s15, 0

		# used in IXOR_R instruction
		s_mov_b32       s63, -1

		# used in CBRANCH instruction
		# s70..s85 = an 8-bit wide mask (0xFF) shifted left by 8..23 bits
		s_mov_b32       s70, (0xFF << 8)
		s_mov_b32       s71, (0xFF << 9)
		s_mov_b32       s72, (0xFF << 10)
		s_mov_b32       s73, (0xFF << 11)
		s_mov_b32       s74, (0xFF << 12)
		s_mov_b32       s75, (0xFF << 13)
		s_mov_b32       s76, (0xFF << 14)
		s_mov_b32       s77, (0xFF << 15)
		s_mov_b32       s78, (0xFF << 16)
		s_mov_b32       s79, (0xFF << 17)
		s_mov_b32       s80, (0xFF << 18)
		s_mov_b32       s81, (0xFF << 19)
		s_mov_b32       s82, (0xFF << 20)
		s_mov_b32       s83, (0xFF << 21)
		s_mov_b32       s84, (0xFF << 22)
		s_mov_b32       s85, (0xFF << 23)

		# ScratchpadL3Mask64
		# s86 = L3 size - 64 (s23 still holds the L3 size computed from rx_parameters)
		s_sub_u32       s86, s23, 64
# One iteration of the VM loop, executed s2 + 1 times in total:
#   1. mix two integer registers into the scratchpad addresses and load 8 bytes per lane from each,
#   2. XOR one load into R[sub], convert the other into two doubles and store them to LDS,
#   3. run the external program (s_swappc_b64) with the integer registers held in s[16:31]
#      and the F/E register groups held in v[60:75] on lanes 0-1,
#   4. write the registers back to LDS, mix in the dataset value and store results to the scratchpad.
main_loop:
		# const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
		ds_read_b64     v[24:25], v0
		ds_read_b64     v[26:27], v12
		s_waitcnt       lgkmcnt(0)
		v_xor_b32       v25, v27, v25
		v_xor_b32       v24, v26, v24

		# spAddr1 ^= spMix.y;
		# spAddr0 ^= spMix.x;
		v_xor_b32       v10, v25, v10
		v_xor_b32       v23, v24, v23

		# spAddr1 &= ScratchpadL3Mask64;
		# spAddr0 &= ScratchpadL3Mask64;
		v_and_b32       v10, s86, v10
		v_and_b32       v23, s86, v23

		# Offset for scratchpads
		# offset1 = spAddr1 + sub * 8
		# offset0 = spAddr0 + sub * 8
		v_add_u32       v10, vcc, v10, v1
		v_add_u32       v23, vcc, v23, v1

		# __global ulong* p1 = (__global ulong*)(scratchpad + offset1);
		# __global ulong* p0 = (__global ulong*)(scratchpad + offset0);
		# p1 = v[26:27], p0 = v[23:24]; both pointers are reused for the stores at the end of the iteration
		v_add_u32       v26, vcc, v2, v10
		v_addc_u32      v27, vcc, v3, 0, vcc
		v_add_u32       v23, vcc, v2, v23
		v_addc_u32      v24, vcc, v3, 0, vcc

		# load from spAddr1
		flat_load_dwordx2 v[28:29], v[26:27]

		# load from spAddr0
		flat_load_dwordx2 v[30:31], v[23:24]
		# Wait only for the first load (spAddr1)
		s_waitcnt       vmcnt(1)

		# Convert the two signed 32-bit halves of the spAddr1 value to doubles:
		# v[32:33] = (double) low dword, v[28:29] = (double) high dword
		v_cvt_f64_i32   v[32:33], v28
		v_cvt_f64_i32   v[28:29], v29
		s_waitcnt       vmcnt(0)

		# R[sub] ^= *p0;
		v_xor_b32       v34, v21, v30
		v_xor_b32       v35, v22, v31

		# Start loading 8 bytes from the dataset at (v6/v20 base) + v36 + sub * 8 into v[21:22];
		# the value is consumed after the program call
		v_add_u32       v22, vcc, v6, v36
		v_addc_u32      v25, vcc, v20, 0, vcc
		v_add_u32       v21, vcc, v22, v1
		v_addc_u32      v22, vcc, v25, 0, vcc
		flat_load_dwordx2 v[21:22], v[21:22]
		# Apply the per-lane masks to both doubles: high dword &= v19, then OR with v[13:14] / v[15:16].
		# On lanes 0-3 v19 = -1 and v13..v16 = 0, so the values are stored unchanged;
		# on lanes 4-7 the high dword is masked with 0xffffff and the values read from
		# LDS offsets 160/168 during setup are ORed in.
		v_or_b32        v30, v32, v13
		v_and_b32       v31, v33, v19
		v_or_b32        v31, v31, v14
		v_or_b32        v28, v28, v15
		v_and_b32       v29, v29, v19
		v_or_b32        v29, v29, v16
		# Store both doubles (16 bytes) at LDS offset 64 + lane * 16
		ds_write2_b64   v5, v[30:31], v[28:29] offset1:1
		s_waitcnt       lgkmcnt(0)

		# Program 0

		# load group F,E registers
		# Read low 8 bytes into lane 0 and high 8 bytes into lane 1
		# (LDS offsets 64, 80, ..., 176 plus v41)
		s_mov_b64       exec, 3
		ds_read2_b64    v[60:63], v41 offset0:8 offset1:10
		ds_read2_b64    v[64:67], v41 offset0:12 offset1:14
		ds_read2_b64    v[68:71], v41 offset0:16 offset1:18
		ds_read2_b64    v[72:75], v41 offset0:20 offset1:22

		# load VM integer registers
		# register i (lane i of v[34:35]) goes to s[16+2*i : 17+2*i]
		v_readlane_b32	s16, v34, 0
		v_readlane_b32	s17, v35, 0
		v_readlane_b32	s18, v34, 1
		v_readlane_b32	s19, v35, 1
		v_readlane_b32	s20, v34, 2
		v_readlane_b32	s21, v35, 2
		v_readlane_b32	s22, v34, 3
		v_readlane_b32	s23, v35, 3
		v_readlane_b32	s24, v34, 4
		v_readlane_b32	s25, v35, 4
		v_readlane_b32	s26, v34, 5
		v_readlane_b32	s27, v35, 5
		v_readlane_b32	s28, v34, 6
		v_readlane_b32	s29, v35, 6
		v_readlane_b32	s30, v34, 7
		v_readlane_b32	s31, v35, 7

		s_waitcnt       lgkmcnt(0)

		# call JIT code
		# jumps to s[4:5] with exec = 3; the return address is saved in s[12:13]
		s_swappc_b64    s[12:13], s[4:5]

		# Write out group F,E registers
		# Write low 8 bytes from lane 0 and high 8 bytes from lane 1
		ds_write2_b64   v41, v[60:61], v[62:63] offset0:8 offset1:10
		ds_write2_b64   v41, v[64:65], v[66:67] offset0:12 offset1:14
		ds_write2_b64   v41, v[68:69], v[70:71] offset0:16 offset1:18
		ds_write2_b64   v41, v[72:73], v[74:75] offset0:20 offset1:22

		# store VM integer registers
		# s[16+2*i : 17+2*i] goes back to lane i of v[28:29]
		v_writelane_b32 v28, s16, 0
		v_writelane_b32 v29, s17, 0
		v_writelane_b32 v28, s18, 1
		v_writelane_b32 v29, s19, 1
		v_writelane_b32 v28, s20, 2
		v_writelane_b32 v29, s21, 2
		v_writelane_b32 v28, s22, 3
		v_writelane_b32 v29, s23, 3
		v_writelane_b32 v28, s24, 4
		v_writelane_b32 v29, s25, 4
		v_writelane_b32 v28, s26, 5
		v_writelane_b32 v29, s27, 5
		v_writelane_b32 v28, s28, 6
		v_writelane_b32 v29, s29, 6
		v_writelane_b32 v28, s30, 7
		v_writelane_b32 v29, s31, 7

		# Restore execution mask
		s_mov_b64       exec, s[36:37]

		# Write out VM integer registers
		ds_write_b64    v17, v[28:29]

		s_waitcnt       lgkmcnt(0)
		# R[sub] ^= value loaded from the dataset before the program call
		# NOTE(review): no vmcnt wait is visible between that flat_load and this use;
		# presumably the called program waits for it - confirm.
		v_xor_b32       v21, v28, v21
		v_xor_b32       v22, v29, v22
		# v28/v29 = low dwords of the two integer registers whose LDS addresses are in v7/v8
		ds_read_b32     v28, v7
		ds_read_b32     v29, v8
		ds_write_b64    v17, v[21:22]
		# Wait for the two reads above (all but the last outstanding LDS operation)
		s_waitcnt       lgkmcnt(1)
		# v[30:31] = qword at LDS v17 + 64, v[32:33] = qword at LDS v17 + 128
		ds_read2_b64    v[30:33], v17 offset0:8 offset1:16
		# v10 = (v37 ^ v28 ^ v29) & 0x7fffffc0: next value for v36 (see the rotation below)
		v_xor_b32       v10, v28, v37
		s_waitcnt       lgkmcnt(0)
		# v[30:31] = XOR of the two qwords read above
		v_xor_b32       v30, v32, v30
		v_xor_b32       v31, v33, v31
		v_xor_b32       v10, v10, v29
		# *p1 = R[sub]
		flat_store_dwordx2 v[26:27], v[21:22]
		v_and_b32       v10, 0x7fffffc0, v10
		# *p0 = v[30:31]
		flat_store_dwordx2 v[23:24], v[30:31]
		# Leave the loop when the counter reaches 0, otherwise decrement it
		s_cmp_eq_u32    s2, 0
		s_cbranch_scc1  main_loop_end
		s_sub_i32       s2, s2, 1
		# Rotate: v37 <- v36, v36 <- newly computed value; spAddr0 (v23) and spAddr1 (v10) restart from 0
		v_mov_b32       v37, v36
		v_mov_b32       v23, 0
		v_mov_b32       v36, v10
		v_mov_b32       v10, 0
		s_branch        main_loop
# Loop epilogue: each active lane stores three 8-byte values to this group's block in
# "registers" (at lane * 8, + 64 and + 128), then the rounding mode is written back.
main_loop_end:

		# v[0:1] = registers block address (v18 low, v4 high) + sub * 8
		v_add_u32       v0, vcc, v18, v1
		v_addc_u32      v1, vcc, v4, 0, vcc
		# integer register R[sub]
		flat_store_dwordx2 v[0:1], v[21:22]
		v_add_u32       v0, vcc, v0, 64
		v_addc_u32      v1, vcc, v1, 0, vcc
		# XOR of the qwords at LDS v17 + 64 and v17 + 128 (computed in the last iteration)
		flat_store_dwordx2 v[0:1], v[30:31]
		v_add_u32       v0, vcc, v0, 64
		v_addc_u32      v1, vcc, v1, 0, vcc
		# qword read from LDS v17 + 128 in the last iteration
		flat_store_dwordx2 v[0:1], v[32:33]

		# store rounding mode
		# s[64:65] = &rounding_modes[group id], s66 = rounding mode value
		v_mov_b32       v0, s64
		v_mov_b32       v1, s65
		v_mov_b32       v2, s66
		flat_store_dword v[0:1], v2

program_end:
		s_endpgm

# FSQRT_R subroutine for v[68:69]: replaces the value with its square root.
# In/out: v[68:69] (only lanes holding a "positive normal value" are updated).
# Uses:   s66 (current rounding mode), s67 (0), s[68:69] (class mask), v82 (sign mask), v84 (1 << 20).
# Clobbers: v28, v29, v42, v43, v46-v49, vcc, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fsqrt_r_sub0:
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# y = v[28:29] = approximate 1/sqrt(x)
		v_rsq_f64       v[28:29], v[68:69]

		# Improve initial approximation (can be skipped)
		#v_mul_f64       v[42:43], v[28:29], v[68:69]
		#v_mul_f64       v[48:49], v[28:29], -0.5
		#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
		#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

		# g = v[42:43] = x * y (approximate sqrt)
		v_mul_f64       v[42:43], v[28:29], v[68:69]
		# h = v[48:49] = 0.5 * y (exponent decremented through the high dword)
		v_mov_b32       v48, v28
		v_sub_u32       v49, vcc, v29, v84
		# v[46:47] = -h (sign bit flipped)
		v_mov_b32       v46, v28
		v_xor_b32       v47, v49, v82
		# e = 0.5 - h * g
		v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
		# g += g * e, h += h * e
		v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
		v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
		# d = x - g * g
		v_fma_f64       v[46:47], -v[42:43], v[42:43], v[68:69]
		# The final correction g + d * h runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]
		# Narrow exec to lanes whose input matches the class mask, write the result there
		v_cmpx_class_f64 s[14:15], v[68:69], s[68:69]
		v_mov_b32       v68, v42
		v_mov_b32       v69, v43
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]

# FSQRT_R subroutine for v[70:71]: replaces the value with its square root.
# In/out: v[70:71] (only lanes holding a "positive normal value" are updated).
# Uses:   s66 (current rounding mode), s67 (0), s[68:69] (class mask), v82 (sign mask), v84 (1 << 20).
# Clobbers: v28, v29, v42, v43, v46-v49, vcc, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fsqrt_r_sub1:
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# y = v[28:29] = approximate 1/sqrt(x)
		v_rsq_f64       v[28:29], v[70:71]

		# Improve initial approximation (can be skipped)
		#v_mul_f64       v[42:43], v[28:29], v[70:71]
		#v_mul_f64       v[48:49], v[28:29], -0.5
		#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
		#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

		# g = v[42:43] = x * y (approximate sqrt)
		v_mul_f64       v[42:43], v[28:29], v[70:71]
		# h = v[48:49] = 0.5 * y (exponent decremented through the high dword)
		v_mov_b32       v48, v28
		v_sub_u32       v49, vcc, v29, v84
		# v[46:47] = -h (sign bit flipped)
		v_mov_b32       v46, v28
		v_xor_b32       v47, v49, v82
		# e = 0.5 - h * g
		v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
		# g += g * e, h += h * e
		v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
		v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
		# d = x - g * g
		v_fma_f64       v[46:47], -v[42:43], v[42:43], v[70:71]
		# The final correction g + d * h runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]
		# Narrow exec to lanes whose input matches the class mask, write the result there
		v_cmpx_class_f64 s[14:15], v[70:71], s[68:69]
		v_mov_b32       v70, v42
		v_mov_b32       v71, v43
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]

# FSQRT_R subroutine for v[72:73]: replaces the value with its square root.
# In/out: v[72:73] (only lanes holding a "positive normal value" are updated).
# Uses:   s66 (current rounding mode), s67 (0), s[68:69] (class mask), v82 (sign mask), v84 (1 << 20).
# Clobbers: v28, v29, v42, v43, v46-v49, vcc, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fsqrt_r_sub2:
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# y = v[28:29] = approximate 1/sqrt(x)
		v_rsq_f64       v[28:29], v[72:73]

		# Improve initial approximation (can be skipped)
		#v_mul_f64       v[42:43], v[28:29], v[72:73]
		#v_mul_f64       v[48:49], v[28:29], -0.5
		#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
		#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

		# g = v[42:43] = x * y (approximate sqrt)
		v_mul_f64       v[42:43], v[28:29], v[72:73]
		# h = v[48:49] = 0.5 * y (exponent decremented through the high dword)
		v_mov_b32       v48, v28
		v_sub_u32       v49, vcc, v29, v84
		# v[46:47] = -h (sign bit flipped)
		v_mov_b32       v46, v28
		v_xor_b32       v47, v49, v82
		# e = 0.5 - h * g
		v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
		# g += g * e, h += h * e
		v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
		v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
		# d = x - g * g
		v_fma_f64       v[46:47], -v[42:43], v[42:43], v[72:73]
		# The final correction g + d * h runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]
		# Narrow exec to lanes whose input matches the class mask, write the result there
		v_cmpx_class_f64 s[14:15], v[72:73], s[68:69]
		v_mov_b32       v72, v42
		v_mov_b32       v73, v43
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]

# FSQRT_R subroutine for v[74:75]: replaces the value with its square root.
# In/out: v[74:75] (only lanes holding a "positive normal value" are updated).
# Uses:   s66 (current rounding mode), s67 (0), s[68:69] (class mask), v82 (sign mask), v84 (1 << 20).
# Clobbers: v28, v29, v42, v43, v46-v49, vcc, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fsqrt_r_sub3:
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# y = v[28:29] = approximate 1/sqrt(x)
		v_rsq_f64       v[28:29], v[74:75]

		# Improve initial approximation (can be skipped)
		#v_mul_f64       v[42:43], v[28:29], v[74:75]
		#v_mul_f64       v[48:49], v[28:29], -0.5
		#v_fma_f64       v[48:49], v[48:49], v[42:43], 0.5
		#v_fma_f64       v[28:29], v[28:29], v[48:49], v[28:29]

		# g = v[42:43] = x * y (approximate sqrt)
		v_mul_f64       v[42:43], v[28:29], v[74:75]
		# h = v[48:49] = 0.5 * y (exponent decremented through the high dword)
		v_mov_b32       v48, v28
		v_sub_u32       v49, vcc, v29, v84
		# v[46:47] = -h (sign bit flipped)
		v_mov_b32       v46, v28
		v_xor_b32       v47, v49, v82
		# e = 0.5 - h * g
		v_fma_f64       v[46:47], v[46:47], v[42:43], 0.5
		# g += g * e, h += h * e
		v_fma_f64       v[42:43], v[42:43], v[46:47], v[42:43]
		v_fma_f64       v[48:49], v[48:49], v[46:47], v[48:49]
		# d = x - g * g
		v_fma_f64       v[46:47], -v[42:43], v[42:43], v[74:75]
		# The final correction g + d * h runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64       v[42:43], v[46:47], v[48:49], v[42:43]
		# Narrow exec to lanes whose input matches the class mask, write the result there
		v_cmpx_class_f64 s[14:15], v[74:75], s[68:69]
		v_mov_b32       v74, v42
		v_mov_b32       v75, v43
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]

# FDIV_M subroutine for v[68:69]: v[68:69] = v[68:69] / divisor.
# In:     v[28:29] = raw divisor (masked below with v77 = xmantissaMask, v[78:79] = xexponentMask).
# In/out: v[68:69] = dividend / quotient.
# Uses:   s66 (current rounding mode), s67 (0), v83 (high dword of 1.0).
# Clobbers: v28, v29, v42, v43, v48, v49, v80, v81, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fdiv_m_sub0:
		# d = v[28:29]: low dword |= v78, high dword = (high & v77) | v79
		v_or_b32        v28, v28, v78
		v_and_b32       v29, v29, v77
		v_or_b32        v29, v29, v79
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# r = approximate 1/d, refined once: r += r * (1 - d * r)
		v_rcp_f64       v[48:49], v[28:29]
		v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
		v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
		# q = x * r, residual = x - d * q
		v_mul_f64       v[80:81], v[68:69], v[48:49]
		v_fma_f64       v[42:43], -v[28:29], v[80:81], v[68:69]
		# The final correction q + residual * r runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64  v[42:43], v[42:43], v[48:49], v[80:81]
		v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
		# On lanes where x == d force the result to exactly 1.0 (low dword 0, high dword v83)
		v_cmpx_eq_f64   s[14:15], v[68:69], v[28:29]
		v_mov_b32 v80, 0
		v_mov_b32 v81, v83
		s_mov_b64       exec, 3
		v_mov_b32       v68, v80
		v_mov_b32       v69, v81
		s_setpc_b64     s[60:61]

# FDIV_M subroutine for v[70:71]: v[70:71] = v[70:71] / divisor.
# In:     v[28:29] = raw divisor (masked below with v77 = xmantissaMask, v[78:79] = xexponentMask).
# In/out: v[70:71] = dividend / quotient.
# Uses:   s66 (current rounding mode), s67 (0), v83 (high dword of 1.0).
# Clobbers: v28, v29, v42, v43, v48, v49, v80, v81, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fdiv_m_sub1:
		# d = v[28:29]: low dword |= v78, high dword = (high & v77) | v79
		v_or_b32        v28, v28, v78
		v_and_b32       v29, v29, v77
		v_or_b32        v29, v29, v79
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# r = approximate 1/d, refined once: r += r * (1 - d * r)
		v_rcp_f64       v[48:49], v[28:29]
		v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
		v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
		# q = x * r, residual = x - d * q
		v_mul_f64       v[80:81], v[70:71], v[48:49]
		v_fma_f64       v[42:43], -v[28:29], v[80:81], v[70:71]
		# The final correction q + residual * r runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64  v[42:43], v[42:43], v[48:49], v[80:81]
		v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
		# On lanes where x == d force the result to exactly 1.0 (low dword 0, high dword v83)
		v_cmpx_eq_f64   s[14:15], v[70:71], v[28:29]
		v_mov_b32 v80, 0
		v_mov_b32 v81, v83
		s_mov_b64       exec, 3
		v_mov_b32       v70, v80
		v_mov_b32       v71, v81
		s_setpc_b64     s[60:61]

# FDIV_M subroutine for v[72:73]: v[72:73] = v[72:73] / divisor.
# In:     v[28:29] = raw divisor (masked below with v77 = xmantissaMask, v[78:79] = xexponentMask).
# In/out: v[72:73] = dividend / quotient.
# Uses:   s66 (current rounding mode), s67 (0), v83 (high dword of 1.0).
# Clobbers: v28, v29, v42, v43, v48, v49, v80, v81, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fdiv_m_sub2:
		# d = v[28:29]: low dword |= v78, high dword = (high & v77) | v79
		v_or_b32        v28, v28, v78
		v_and_b32       v29, v29, v77
		v_or_b32        v29, v29, v79
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# r = approximate 1/d, refined once: r += r * (1 - d * r)
		v_rcp_f64       v[48:49], v[28:29]
		v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
		v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
		# q = x * r, residual = x - d * q
		v_mul_f64       v[80:81], v[72:73], v[48:49]
		v_fma_f64       v[42:43], -v[28:29], v[80:81], v[72:73]
		# The final correction q + residual * r runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64  v[42:43], v[42:43], v[48:49], v[80:81]
		v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
		# On lanes where x == d force the result to exactly 1.0 (low dword 0, high dword v83)
		v_cmpx_eq_f64   s[14:15], v[72:73], v[28:29]
		v_mov_b32 v80, 0
		v_mov_b32 v81, v83
		s_mov_b64       exec, 3
		v_mov_b32       v72, v80
		v_mov_b32       v73, v81
		s_setpc_b64     s[60:61]

# FDIV_M subroutine for v[74:75]: v[74:75] = v[74:75] / divisor.
# In:     v[28:29] = raw divisor (masked below with v77 = xmantissaMask, v[78:79] = xexponentMask).
# In/out: v[74:75] = dividend / quotient.
# Uses:   s66 (current rounding mode), s67 (0), v83 (high dword of 1.0).
# Clobbers: v28, v29, v42, v43, v48, v49, v80, v81, s[14:15]; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
fdiv_m_sub3:
		# d = v[28:29]: low dword |= v78, high dword = (high & v77) | v79
		v_or_b32        v28, v28, v78
		v_and_b32       v29, v29, v77
		v_or_b32        v29, v29, v79
		# Intermediate steps run with the rounding field set to s67 (0)
		s_setreg_b32    hwreg(mode, 2, 2), s67
		# r = approximate 1/d, refined once: r += r * (1 - d * r)
		v_rcp_f64       v[48:49], v[28:29]
		v_fma_f64       v[80:81], -v[28:29], v[48:49], 1.0
		v_fma_f64       v[48:49], v[48:49], v[80:81], v[48:49]
		# q = x * r, residual = x - d * q
		v_mul_f64       v[80:81], v[74:75], v[48:49]
		v_fma_f64       v[42:43], -v[28:29], v[80:81], v[74:75]
		# The final correction q + residual * r runs with the program's rounding mode
		s_setreg_b32    hwreg(mode, 2, 2), s66
		v_fma_f64  v[42:43], v[42:43], v[48:49], v[80:81]
		v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
		# On lanes where x == d force the result to exactly 1.0 (low dword 0, high dword v83)
		v_cmpx_eq_f64   s[14:15], v[74:75], v[28:29]
		v_mov_b32 v80, 0
		v_mov_b32 v81, v83
		s_mov_b64       exec, 3
		v_mov_b32       v74, v80
		v_mov_b32       v75, v81
		s_setpc_b64     s[60:61]

# ISMULH_R subroutine: s[14:15] = high 64 bits of the signed 128-bit product s[14:15] * s[38:39].
# In:  s[14:15], s[38:39] = 64-bit operands (s[38:39] is preserved).
# Out: s[14:15].
# Relies on v41 = 0 and v44 = 0 on lane 0 (zero high/low halves of the 64-bit addends).
# Clobbers: v40, v42, v43, v45-v47, s[32:35], vcc, scc; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
ismulh_r_sub:
		# All vector work is done on lane 0 only
		s_mov_b64       exec, 1
		# With a = s[14:15] and b = s[38:39]:
		# v40 = hi32(b.lo * a.lo)
		v_mov_b32       v45, s14
		v_mul_hi_u32    v40, s38, v45
		v_mov_b32       v47, s15
		# v[42:43] = b.lo * a.hi + v40
		v_mad_u64_u32   v[42:43], s[32:33], s38, v47, v[40:41]
		v_mov_b32       v40, v42
		# v[45:46] = b.hi * a.lo + lo32(previous sum); only the high dword v46 is needed
		v_mad_u64_u32   v[45:46], s[32:33], s39, v45, v[40:41]
		# v[42:43] = b.hi * a.hi + hi32(first sum), then add v46 with carry
		v_mad_u64_u32   v[42:43], s[32:33], s39, v47, v[43:44]
		v_add_u32       v42, vcc, v42, v46
		v_addc_u32      v43, vcc, 0, v43, vcc
		# s[32:33] = unsigned high 64 bits of a * b
		v_readlane_b32  s32, v42, 0
		v_readlane_b32  s33, v43, 0
		# Signed correction: subtract b if a < 0 ...
		s_cmp_lt_i32    s15, 0
		s_cselect_b64   s[34:35], s[38:39], 0
		s_sub_u32       s32, s32, s34
		s_subb_u32      s33, s33, s35
		# ... and subtract a if b < 0
		s_cmp_lt_i32    s39, 0
		s_cselect_b64   s[34:35], s[14:15], 0
		s_sub_u32       s14, s32, s34
		s_subb_u32      s15, s33, s35
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]

# IMULH_R subroutine: s[14:15] = high 64 bits of the unsigned 128-bit product s[14:15] * s[38:39].
# In:  s[14:15], s[38:39] = 64-bit operands (s[38:39] is preserved).
# Out: s[14:15].
# Relies on v41 = 0 and v44 = 0 on lane 0 (zero high/low halves of the 64-bit addends).
# Clobbers: v40, v42, v43, v45-v47, s[32:33], vcc; exec is set to 3 on exit.
# Returns by jumping to s[60:61]; the caller is not visible here (presumably the JIT program).
imulh_r_sub:
		# All vector work is done on lane 0 only
		s_mov_b64       exec, 1
		# With a = s[14:15] and b = s[38:39]:
		# v40 = hi32(a.lo * b.lo)
		v_mov_b32       v45, s38
		v_mul_hi_u32    v40, s14, v45
		v_mov_b32       v47, s39
		# v[42:43] = a.lo * b.hi + v40
		v_mad_u64_u32   v[42:43], s[32:33], s14, v47, v[40:41]
		v_mov_b32       v40, v42
		# v[45:46] = a.hi * b.lo + lo32(previous sum); only the high dword v46 is needed
		v_mad_u64_u32   v[45:46], s[32:33], s15, v45, v[40:41]
		# v[42:43] = a.hi * b.hi + hi32(first sum), then add v46 with carry
		v_mad_u64_u32   v[42:43], s[32:33], s15, v47, v[43:44]
		v_add_u32       v42, vcc, v42, v46
		v_addc_u32      v43, vcc, 0, v43, vcc
		v_readlane_b32  s14, v42, 0
		v_readlane_b32  s15, v43, 0
		s_mov_b64       exec, 3
		s_setpc_b64     s[60:61]