/*
Copyright (c) 2019 SChernykh
This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/

.amdcl2
.gpu GFX803
.64bit
.arch_minor 0
.arch_stepping 0
.driver_version 203603
.kernel randomx_run
    .config
        .dims x
        .cws 64, 1, 1
        .sgprsnum 96
        # 6 waves per SIMD: 37-40 VGPRs
        # 5 waves per SIMD: 41-48 VGPRs
        # 4 waves per SIMD: 49-64 VGPRs
        # 3 waves per SIMD: 65-84 VGPRs
        # 2 waves per SIMD: 85-128 VGPRs
        # 1 wave per SIMD: 129-256 VGPRs
        .vgprsnum 128
        .localsize 256
        .floatmode 0xc0
        .pgmrsrc1 0x00ac035f
        .pgmrsrc2 0x0000008c
        .dx10clamp
        .ieeemode
        .useargs
        .priority 0
        .arg _.global_offset_0, "size_t", long
        .arg _.global_offset_1, "size_t", long
        .arg _.global_offset_2, "size_t", long
        .arg _.printf_buffer, "size_t", void*, global, , rdonly
        .arg _.vqueue_pointer, "size_t", long
        .arg _.aqlwrap_pointer, "size_t", long
        .arg dataset, "uchar*", uchar*, global, const, rdonly
        .arg scratchpad, "uchar*", uchar*, global,
        .arg registers, "ulong*", ulong*, global,
        .arg rounding_modes, "uint*", uint*, global,
        .arg programs, "uint*", uint*, global,
        .arg batch_size, "uint", uint
        .arg rx_parameters, "uint", uint
    .text
    s_mov_b32 m0, 0x10000
    s_dcache_wb
    s_waitcnt vmcnt(0) & lgkmcnt(0)
    s_icache_inv
    s_branch begin

# pgmrsrc2 = 0x0000008c, bits 1:5 = 6, so the first 6 SGPRs (s0-s5) contain user data
# s6 contains the group id (copied to s8 below)
# v0 contains the local id

begin:
    s_mov_b32 s8, s6
    v_lshlrev_b32 v1, 6, s8
    v_add_u32 v1, vcc, v1, v0
    s_load_dwordx2 s[0:1], s[4:5], 0x0
    s_load_dwordx2 s[2:3], s[4:5], 0x40
    s_load_dwordx2 s[64:65], s[4:5], 0x48
    s_waitcnt lgkmcnt(0)

    # load rounding mode
    s_lshl_b32 s16, s8, 2
    s_add_u32 s64, s64, s16
    s_addc_u32 s65, s65, 0
    v_mov_b32 v8, s64
    v_mov_b32 v9, s65
    flat_load_dword v8, v[8:9]
    s_waitcnt vmcnt(0)
    v_readlane_b32 s66, v8, 0
    s_setreg_b32 hwreg(mode, 2, 2), s66
    s_mov_b32 s67, 0

    # used in FSQRT_R to check for "positive normal value" (v_cmpx_class_f64)
    s_mov_b32 s68, 256
    s_mov_b32 s69, 0

    v_add_u32 v1, vcc, s0, v1
    v_lshrrev_b32 v2, 6, v1
    v_lshlrev_b32 v3, 5, v2
    v_and_b32 v1, 63, v1
    v_mov_b32 v4, 0
    v_lshlrev_b64 v[3:4], 3, v[3:4]
    v_lshlrev_b32 v5, 4, v1
    v_add_u32 v3, vcc, s2, v3
    v_mov_b32 v6, s3
    v_addc_u32 v4, vcc, v6, v4, vcc
    v_lshlrev_b32 v41, 2, v1
    v_add_u32 v6, vcc, v3, v41
    v_addc_u32 v7, vcc, v4, 0, vcc
    flat_load_dword v6, v[6:7]
    v_mov_b32 v0, 0
    s_waitcnt vmcnt(0)
    ds_write_b32 v41, v6
    s_waitcnt lgkmcnt(0)
    s_mov_b64 s[0:1], exec
    v_cmpx_le_u32 s[2:3], v1, 7
    s_cbranch_execz program_end

    # rx_parameters
    s_load_dword s20, s[4:5], 0x5c
    s_waitcnt lgkmcnt(0)

    # Scratchpad L1 size
    s_bfe_u32 s21, s20, 0x050000
    s_lshl_b32 s21, 1, s21

    # Scratchpad L2 size
    s_bfe_u32 s22, s20, 0x050005
    s_lshl_b32 s22, 1, s22

    # Scratchpad L3 size
    s_bfe_u32 s23, s20, 0x05000A
    s_lshl_b32 s23, 1, s23

    # program iterations
    s_bfe_u32 s24, s20, 0x04000F
    s_lshl_b32 s24, 1, s24
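    # The four s_bfe_u32/s_lshl_b32 pairs above unpack rx_parameters. A rough
    # C sketch of the same decoding, assuming each field stores log2 of the
    # value (as implied by the "1 << field" shifts); the struct and field
    # names are illustrative, not part of the host API:
    #
    #   typedef struct { unsigned l1, l2, l3, iterations; } rx_params_t;
    #
    #   static rx_params_t decode_rx_parameters(unsigned p)
    #   {
    #       rx_params_t r;
    #       r.l1         = 1u << ((p >>  0) & 31);  /* s_bfe_u32 ..., 0x050000: width 5, offset 0  */
    #       r.l2         = 1u << ((p >>  5) & 31);  /* s_bfe_u32 ..., 0x050005: width 5, offset 5  */
    #       r.l3         = 1u << ((p >> 10) & 31);  /* s_bfe_u32 ..., 0x05000A: width 5, offset 10 */
    #       r.iterations = 1u << ((p >> 15) & 15);  /* s_bfe_u32 ..., 0x04000F: width 4, offset 15 */
    #       return r;
    #   }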
    # Base address for scratchpads
    s_add_u32 s2, s23, 64
    v_mul_hi_u32 v20, v2, s2
    v_mul_lo_u32 v2, v2, s2

    # v41, v44 = 0
    v_mov_b32 v41, 0
    v_mov_b32 v44, 0

    ds_read_b32 v6, v0 offset:152
    v_cmp_lt_u32 s[2:3], v1, 4
    ds_read2_b64 v[34:37], v0 offset0:18 offset1:16
    ds_read_b64 v[11:12], v0 offset:136
    s_movk_i32 s9, 0x0
    s_mov_b64 s[6:7], exec
    s_andn2_b64 exec, s[6:7], s[2:3]
    ds_read_b64 v[13:14], v0 offset:160
    s_andn2_b64 exec, s[6:7], exec
    v_mov_b32 v13, 0
    v_mov_b32 v14, 0
    s_mov_b64 exec, s[6:7]

    # compiled program size
    s_mov_b64 s[6:7], s[8:9]
    s_mulk_i32 s6, 10048

    v_add_u32 v5, vcc, v0, v5
    v_add_u32 v5, vcc, v5, 64
    s_mov_b64 s[8:9], exec
    s_andn2_b64 exec, s[8:9], s[2:3]
    ds_read_b64 v[15:16], v0 offset:168
    s_andn2_b64 exec, s[8:9], exec
    v_mov_b32 v15, 0
    v_mov_b32 v16, 0
    s_mov_b64 exec, s[8:9]

    s_load_dwordx4 s[8:11], s[4:5], 0x30

    # batch_size
    s_load_dword s16, s[4:5], 0x58

    s_load_dwordx2 s[4:5], s[4:5], 0x50
    v_lshlrev_b32 v1, 3, v1
    v_add_u32 v17, vcc, v0, v1
    s_waitcnt lgkmcnt(0)
    v_add_u32 v2, vcc, s10, v2
    v_mov_b32 v18, s11
    v_addc_u32 v18, vcc, v18, v20, vcc
    v_mov_b32 v19, 0xffffff
    v_add_u32 v6, vcc, s8, v6
    v_mov_b32 v20, s9
    v_addc_u32 v20, vcc, v20, 0, vcc
    ds_read_b64 v[21:22], v17
    s_add_u32 s4, s4, s6
    s_addc_u32 s5, s5, s7
    v_cndmask_b32 v19, v19, -1, s[2:3]
    v_lshlrev_b32 v8, 3, v35
    v_lshlrev_b32 v7, 3, v34
    v_lshlrev_b32 v12, 3, v12
    v_lshlrev_b32 v10, 3, v11
    v_add_u32 v8, vcc, v8, v0
    v_add_u32 v7, vcc, v7, v0
    v_add_u32 v12, vcc, v12, v0
    v_add_u32 v0, vcc, v10, v0
    v_mov_b32 v10, v36
    v_mov_b32 v23, v37

    # loop counter
    s_sub_u32 s2, s24, 1

    # batch_size
    s_mov_b32 s3, s16

    # Scratchpad masks (L1, L2, L3)
    v_sub_u32 v38, vcc, s21, 8
    v_sub_u32 v39, vcc, s22, 8
    v_sub_u32 v50, vcc, s23, 8

    # mask for FSCAL_R
    v_mov_b32 v51, 0x80F00000

    # swap v3 and v18
    v_mov_b32 v52, v3
    v_mov_b32 v3, v18
    v_mov_b32 v18, v52

    # load scratchpad base address
    v_readlane_b32 s0, v2, 0
    v_readlane_b32 s1, v3, 0

    # save current execution mask
    s_mov_b64 s[36:37], exec

    # v41 = 0 on lane 0, set it to 8 on lane 1
    # v44 = 0 on lane 0, set it to 4 on lane 1
    s_mov_b64 exec, 2
    v_mov_b32 v41, 8
    v_mov_b32 v44, 4

    # load group A registers
    # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
    s_mov_b64 exec, 3
    ds_read2_b64 v[52:55], v41 offset0:24 offset1:26
    ds_read2_b64 v[56:59], v41 offset0:28 offset1:30

    # xmantissaMask
    v_mov_b32 v77, (1 << 24) - 1

    # xexponentMask
    ds_read_b64 v[78:79], v41 offset:160

    # Restore execution mask
    s_mov_b64 exec, s[36:37]

    # sign mask (used in FSQRT_R)
    v_mov_b32 v82, 0x80000000

    # High 32 bits of "1.0" constant (used in FDIV_M)
    v_mov_b32 v83, (1023 << 20)

    # Used to multiply FP64 values by 0.5
    v_mov_b32 v84, (1 << 20)

    s_getpc_b64 s[14:15]
cur_addr:
    # get addresses of FSQRT_R subroutines
    s_add_u32 s40, s14, fsqrt_r_sub0 - cur_addr
    s_addc_u32 s41, s15, 0
    s_add_u32 s42, s14, fsqrt_r_sub1 - cur_addr
    s_addc_u32 s43, s15, 0
    s_add_u32 s44, s14, fsqrt_r_sub2 - cur_addr
    s_addc_u32 s45, s15, 0
    s_add_u32 s46, s14, fsqrt_r_sub3 - cur_addr
    s_addc_u32 s47, s15, 0

    # get addresses of FDIV_M subroutines
    s_add_u32 s48, s14, fdiv_m_sub0 - cur_addr
    s_addc_u32 s49, s15, 0
    s_add_u32 s50, s14, fdiv_m_sub1 - cur_addr
    s_addc_u32 s51, s15, 0
    s_add_u32 s52, s14, fdiv_m_sub2 - cur_addr
    s_addc_u32 s53, s15, 0
    s_add_u32 s54, s14, fdiv_m_sub3 - cur_addr
    s_addc_u32 s55, s15, 0

    # get address for ISMULH_R subroutine
    s_add_u32 s56, s14, ismulh_r_sub - cur_addr
    s_addc_u32 s57, s15, 0

    # get address for IMULH_R subroutine
    s_add_u32 s58, s14, imulh_r_sub - cur_addr
    s_addc_u32 s59, s15, 0

    # used in IXOR_R instruction
    s_mov_b32 s63, -1

    # used in CBRANCH instruction
    s_mov_b32 s70, (0xFF << 8)
    s_mov_b32 s71, (0xFF << 9)
    s_mov_b32 s72, (0xFF << 10)
    s_mov_b32 s73, (0xFF << 11)
    s_mov_b32 s74, (0xFF << 12)
    s_mov_b32 s75, (0xFF << 13)
    s_mov_b32 s76, (0xFF << 14)
    s_mov_b32 s77, (0xFF << 15)
    s_mov_b32 s78, (0xFF << 16)
    s_mov_b32 s79, (0xFF << 17)
    s_mov_b32 s80, (0xFF << 18)
    s_mov_b32 s81, (0xFF << 19)
    s_mov_b32 s82, (0xFF << 20)
    s_mov_b32 s83, (0xFF << 21)
    s_mov_b32 s84, (0xFF << 22)
    s_mov_b32 s85, (0xFF << 23)
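    # The sixteen masks in s70-s85 cover condition shifts 0-15 with the 8-bit
    # jump-condition field starting at bit 8. A hedged C sketch of the CBRANCH
    # test these masks are meant to serve, based on the RandomX specification
    # (function and variable names are illustrative):
    #
    #   #include <stdint.h>
    #
    #   /* dst += imm; the branch is taken when the masked bits are all zero */
    #   static uint64_t cbranch(uint64_t dst, int64_t imm,
    #                           unsigned cond /* 0..15 */, int *taken)
    #   {
    #       const uint64_t mask = 0xFFull << (cond + 8);   /* s70 + cond */
    #       dst += (uint64_t)imm;
    #       *taken = ((dst & mask) == 0);
    #       return dst;
    #   }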
    # ScratchpadL3Mask64
    s_sub_u32 s86, s23, 64
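    # One iteration of the main loop below, condensed into OpenCL-C-like
    # pseudo code from the inline comments and the surrounding instructions
    # (a rough sketch, not a literal translation; R, F, E follow the RandomX
    # register naming):
    #
    #   const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
    #   spAddr0 = (spAddr0 ^ spMix.x) & ScratchpadL3Mask64;
    #   spAddr1 = (spAddr1 ^ spMix.y) & ScratchpadL3Mask64;
    #
    #   // lanes 0-7 only, one 8-byte word per lane ("sub" = lane id)
    #   R[sub] ^= *(__global ulong*)(scratchpad + spAddr0 + sub * 8);
    #   F[sub], E[sub] are seeded from the two 32-bit halves of
    #   *(__global ulong*)(scratchpad + spAddr1 + sub * 8);
    #
    #   // run this group's JIT-compiled program: s_swappc_b64 to s[4:5];
    #   // integer registers travel in s16-s31, group F/E registers in
    #   // v60-v75 of lanes 0 and 1, rounding mode in s66
    #
    #   R[sub] ^= dataset word;
    #   scratchpad[spAddr1 + sub * 8] = R[sub];
    #   scratchpad[spAddr0 + sub * 8] = F[sub] ^ E[sub];
    #   // the dataset/scratchpad offsets are then rotated and the loop
    #   // repeats for the configured number of program iterations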
main_loop:
    # const uint2 spMix = as_uint2(R[readReg0] ^ R[readReg1]);
    ds_read_b64 v[24:25], v0
    ds_read_b64 v[26:27], v12
    s_waitcnt lgkmcnt(0)
    v_xor_b32 v25, v27, v25
    v_xor_b32 v24, v26, v24

    # spAddr1 ^= spMix.y;
    # spAddr0 ^= spMix.x;
    v_xor_b32 v10, v25, v10
    v_xor_b32 v23, v24, v23

    # spAddr1 &= ScratchpadL3Mask64;
    # spAddr0 &= ScratchpadL3Mask64;
    v_and_b32 v10, s86, v10
    v_and_b32 v23, s86, v23

    # Offset for scratchpads
    # offset1 = spAddr1 + sub * 8
    # offset0 = spAddr0 + sub * 8
    v_add_u32 v10, vcc, v10, v1
    v_add_u32 v23, vcc, v23, v1

    # __global ulong* p1 = (__global ulong*)(scratchpad + offset1);
    # __global ulong* p0 = (__global ulong*)(scratchpad + offset0);
    v_add_u32 v26, vcc, v2, v10
    v_addc_u32 v27, vcc, v3, 0, vcc
    v_add_u32 v23, vcc, v2, v23
    v_addc_u32 v24, vcc, v3, 0, vcc

    # load from spAddr1
    flat_load_dwordx2 v[28:29], v[26:27]

    # load from spAddr0
    flat_load_dwordx2 v[30:31], v[23:24]

    s_waitcnt vmcnt(1)
    v_cvt_f64_i32 v[32:33], v28
    v_cvt_f64_i32 v[28:29], v29
    s_waitcnt vmcnt(0)

    # R[sub] ^= *p0;
    v_xor_b32 v34, v21, v30
    v_xor_b32 v35, v22, v31

    v_add_u32 v22, vcc, v6, v36
    v_addc_u32 v25, vcc, v20, 0, vcc
    v_add_u32 v21, vcc, v22, v1
    v_addc_u32 v22, vcc, v25, 0, vcc
    flat_load_dwordx2 v[21:22], v[21:22]

    v_or_b32 v30, v32, v13
    v_and_b32 v31, v33, v19
    v_or_b32 v31, v31, v14
    v_or_b32 v28, v28, v15
    v_and_b32 v29, v29, v19
    v_or_b32 v29, v29, v16
    ds_write2_b64 v5, v[30:31], v[28:29] offset1:1
    s_waitcnt lgkmcnt(0)

    # Program 0

    # load group F,E registers
    # Read low 8 bytes into lane 0 and high 8 bytes into lane 1
    s_mov_b64 exec, 3
    ds_read2_b64 v[60:63], v41 offset0:8 offset1:10
    ds_read2_b64 v[64:67], v41 offset0:12 offset1:14
    ds_read2_b64 v[68:71], v41 offset0:16 offset1:18
    ds_read2_b64 v[72:75], v41 offset0:20 offset1:22

    # load VM integer registers
    v_readlane_b32 s16, v34, 0
    v_readlane_b32 s17, v35, 0
    v_readlane_b32 s18, v34, 1
    v_readlane_b32 s19, v35, 1
    v_readlane_b32 s20, v34, 2
    v_readlane_b32 s21, v35, 2
    v_readlane_b32 s22, v34, 3
    v_readlane_b32 s23, v35, 3
    v_readlane_b32 s24, v34, 4
    v_readlane_b32 s25, v35, 4
    v_readlane_b32 s26, v34, 5
    v_readlane_b32 s27, v35, 5
    v_readlane_b32 s28, v34, 6
    v_readlane_b32 s29, v35, 6
    v_readlane_b32 s30, v34, 7
    v_readlane_b32 s31, v35, 7
    s_waitcnt lgkmcnt(0)

    # call JIT code
    s_swappc_b64 s[12:13], s[4:5]

    # Write out group F,E registers
    # Write low 8 bytes from lane 0 and high 8 bytes from lane 1
    ds_write2_b64 v41, v[60:61], v[62:63] offset0:8 offset1:10
    ds_write2_b64 v41, v[64:65], v[66:67] offset0:12 offset1:14
    ds_write2_b64 v41, v[68:69], v[70:71] offset0:16 offset1:18
    ds_write2_b64 v41, v[72:73], v[74:75] offset0:20 offset1:22

    # store VM integer registers
    v_writelane_b32 v28, s16, 0
    v_writelane_b32 v29, s17, 0
    v_writelane_b32 v28, s18, 1
    v_writelane_b32 v29, s19, 1
    v_writelane_b32 v28, s20, 2
    v_writelane_b32 v29, s21, 2
    v_writelane_b32 v28, s22, 3
    v_writelane_b32 v29, s23, 3
    v_writelane_b32 v28, s24, 4
    v_writelane_b32 v29, s25, 4
    v_writelane_b32 v28, s26, 5
    v_writelane_b32 v29, s27, 5
    v_writelane_b32 v28, s28, 6
    v_writelane_b32 v29, s29, 6
    v_writelane_b32 v28, s30, 7
    v_writelane_b32 v29, s31, 7

    # Restore execution mask
    s_mov_b64 exec, s[36:37]

    # Write out VM integer registers
    ds_write_b64 v17, v[28:29]
    s_waitcnt lgkmcnt(0)

    v_xor_b32 v21, v28, v21
    v_xor_b32 v22, v29, v22
    ds_read_b32 v28, v7
    ds_read_b32 v29, v8
    ds_write_b64 v17, v[21:22]
    s_waitcnt lgkmcnt(1)
    ds_read2_b64 v[30:33], v17 offset0:8 offset1:16
    v_xor_b32 v10, v28, v37
    s_waitcnt lgkmcnt(0)
    v_xor_b32 v30, v32, v30
    v_xor_b32 v31, v33, v31
    v_xor_b32 v10, v10, v29
    flat_store_dwordx2 v[26:27], v[21:22]
    v_and_b32 v10, 0x7fffffc0, v10
    flat_store_dwordx2 v[23:24], v[30:31]

    s_cmp_eq_u32 s2, 0
    s_cbranch_scc1 main_loop_end

    s_sub_i32 s2, s2, 1
    v_mov_b32 v37, v36
    v_mov_b32 v23, 0
    v_mov_b32 v36, v10
    v_mov_b32 v10, 0
    s_branch main_loop

main_loop_end:
    v_add_u32 v0, vcc, v18, v1
    v_addc_u32 v1, vcc, v4, 0, vcc
    flat_store_dwordx2 v[0:1], v[21:22]
    v_add_u32 v0, vcc, v0, 64
    v_addc_u32 v1, vcc, v1, 0, vcc
    flat_store_dwordx2 v[0:1], v[30:31]
    v_add_u32 v0, vcc, v0, 64
    v_addc_u32 v1, vcc, v1, 0, vcc
    flat_store_dwordx2 v[0:1], v[32:33]

    # store rounding mode
    v_mov_b32 v0, s64
    v_mov_b32 v1, s65
    v_mov_b32 v2, s66
    flat_store_dword v[0:1], v2

program_end:
    s_endpgm
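    # The four fsqrt_r_sub* routines below all follow the same pattern:
    # v_rsq_f64 gives y0 ~= 1/sqrt(x), the intermediate Newton-Raphson steps
    # run in round-to-nearest (FPRC written from s67), and only the final fma
    # uses the VM's current rounding mode (s66); v_cmpx_class_f64 against
    # s[68:69] keeps the original operand unless x is a positive normal.
    # A hedged C sketch of the refinement (illustrative helper, not part of
    # the source):
    #
    #   #include <math.h>
    #
    #   static double fsqrt_r(double x, double y0 /* approx 1/sqrt(x) */)
    #   {
    #       double g = x * y0;            /* ~= sqrt(x)                          */
    #       double h = y0 * 0.5;          /* built by decrementing the exponent  */
    #       double r = fma(-h, g, 0.5);   /* 0.5 - g*h                           */
    #       g = fma(g, r, g);             /* refined sqrt(x)                     */
    #       h = fma(h, r, h);             /* refined 0.5/sqrt(x)                 */
    #       double e = fma(-g, g, x);     /* residual x - g*g                    */
    #       return fma(e, h, g);          /* final step, rounded in the VM mode  */
    #   }
    #
    # The fdiv_m_sub* routines further below follow the analogous scheme with
    # v_rcp_f64, one Newton step, and v_div_fixup_f64 for the final result.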
fsqrt_r_sub0:
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rsq_f64 v[28:29], v[68:69]

    # Improve initial approximation (can be skipped)
    #v_mul_f64 v[42:43], v[28:29], v[68:69]
    #v_mul_f64 v[48:49], v[28:29], -0.5
    #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
    #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

    v_mul_f64 v[42:43], v[28:29], v[68:69]
    v_mov_b32 v48, v28
    v_sub_u32 v49, vcc, v29, v84
    v_mov_b32 v46, v28
    v_xor_b32 v47, v49, v82
    v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
    v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
    v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
    v_fma_f64 v[46:47], -v[42:43], v[42:43], v[68:69]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
    v_cmpx_class_f64 s[14:15], v[68:69], s[68:69]
    v_mov_b32 v68, v42
    v_mov_b32 v69, v43
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]

fsqrt_r_sub1:
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rsq_f64 v[28:29], v[70:71]

    # Improve initial approximation (can be skipped)
    #v_mul_f64 v[42:43], v[28:29], v[70:71]
    #v_mul_f64 v[48:49], v[28:29], -0.5
    #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
    #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

    v_mul_f64 v[42:43], v[28:29], v[70:71]
    v_mov_b32 v48, v28
    v_sub_u32 v49, vcc, v29, v84
    v_mov_b32 v46, v28
    v_xor_b32 v47, v49, v82
    v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
    v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
    v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
    v_fma_f64 v[46:47], -v[42:43], v[42:43], v[70:71]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
    v_cmpx_class_f64 s[14:15], v[70:71], s[68:69]
    v_mov_b32 v70, v42
    v_mov_b32 v71, v43
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]

fsqrt_r_sub2:
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rsq_f64 v[28:29], v[72:73]

    # Improve initial approximation (can be skipped)
    #v_mul_f64 v[42:43], v[28:29], v[72:73]
    #v_mul_f64 v[48:49], v[28:29], -0.5
    #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
    #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

    v_mul_f64 v[42:43], v[28:29], v[72:73]
    v_mov_b32 v48, v28
    v_sub_u32 v49, vcc, v29, v84
    v_mov_b32 v46, v28
    v_xor_b32 v47, v49, v82
    v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
    v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
    v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
    v_fma_f64 v[46:47], -v[42:43], v[42:43], v[72:73]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
    v_cmpx_class_f64 s[14:15], v[72:73], s[68:69]
    v_mov_b32 v72, v42
    v_mov_b32 v73, v43
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]

fsqrt_r_sub3:
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rsq_f64 v[28:29], v[74:75]

    # Improve initial approximation (can be skipped)
    #v_mul_f64 v[42:43], v[28:29], v[74:75]
    #v_mul_f64 v[48:49], v[28:29], -0.5
    #v_fma_f64 v[48:49], v[48:49], v[42:43], 0.5
    #v_fma_f64 v[28:29], v[28:29], v[48:49], v[28:29]

    v_mul_f64 v[42:43], v[28:29], v[74:75]
    v_mov_b32 v48, v28
    v_sub_u32 v49, vcc, v29, v84
    v_mov_b32 v46, v28
    v_xor_b32 v47, v49, v82
    v_fma_f64 v[46:47], v[46:47], v[42:43], 0.5
    v_fma_f64 v[42:43], v[42:43], v[46:47], v[42:43]
    v_fma_f64 v[48:49], v[48:49], v[46:47], v[48:49]
    v_fma_f64 v[46:47], -v[42:43], v[42:43], v[74:75]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[46:47], v[48:49], v[42:43]
    v_cmpx_class_f64 s[14:15], v[74:75], s[68:69]
    v_mov_b32 v74, v42
    v_mov_b32 v75, v43
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]

fdiv_m_sub0:
    v_or_b32 v28, v28, v78
    v_and_b32 v29, v29, v77
    v_or_b32 v29, v29, v79
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rcp_f64 v[48:49], v[28:29]
    v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
    v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
    v_mul_f64 v[80:81], v[68:69], v[48:49]
    v_fma_f64 v[42:43], -v[28:29], v[80:81], v[68:69]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
    v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[68:69]
    v_cmpx_eq_f64 s[14:15], v[68:69], v[28:29]
    v_mov_b32 v80, 0
    v_mov_b32 v81, v83
    s_mov_b64 exec, 3
    v_mov_b32 v68, v80
    v_mov_b32 v69, v81
    s_setpc_b64 s[60:61]

fdiv_m_sub1:
    v_or_b32 v28, v28, v78
    v_and_b32 v29, v29, v77
    v_or_b32 v29, v29, v79
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rcp_f64 v[48:49], v[28:29]
    v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
    v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
    v_mul_f64 v[80:81], v[70:71], v[48:49]
    v_fma_f64 v[42:43], -v[28:29], v[80:81], v[70:71]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
    v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[70:71]
    v_cmpx_eq_f64 s[14:15], v[70:71], v[28:29]
    v_mov_b32 v80, 0
    v_mov_b32 v81, v83
    s_mov_b64 exec, 3
    v_mov_b32 v70, v80
    v_mov_b32 v71, v81
    s_setpc_b64 s[60:61]

fdiv_m_sub2:
    v_or_b32 v28, v28, v78
    v_and_b32 v29, v29, v77
    v_or_b32 v29, v29, v79
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rcp_f64 v[48:49], v[28:29]
    v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
    v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
    v_mul_f64 v[80:81], v[72:73], v[48:49]
    v_fma_f64 v[42:43], -v[28:29], v[80:81], v[72:73]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
    v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[72:73]
    v_cmpx_eq_f64 s[14:15], v[72:73], v[28:29]
    v_mov_b32 v80, 0
    v_mov_b32 v81, v83
    s_mov_b64 exec, 3
    v_mov_b32 v72, v80
    v_mov_b32 v73, v81
    s_setpc_b64 s[60:61]

fdiv_m_sub3:
    v_or_b32 v28, v28, v78
    v_and_b32 v29, v29, v77
    v_or_b32 v29, v29, v79
    s_setreg_b32 hwreg(mode, 2, 2), s67
    v_rcp_f64 v[48:49], v[28:29]
    v_fma_f64 v[80:81], -v[28:29], v[48:49], 1.0
    v_fma_f64 v[48:49], v[48:49], v[80:81], v[48:49]
    v_mul_f64 v[80:81], v[74:75], v[48:49]
    v_fma_f64 v[42:43], -v[28:29], v[80:81], v[74:75]
    s_setreg_b32 hwreg(mode, 2, 2), s66
    v_fma_f64 v[42:43], v[42:43], v[48:49], v[80:81]
    v_div_fixup_f64 v[80:81], v[42:43], v[28:29], v[74:75]
    v_cmpx_eq_f64 s[14:15], v[74:75], v[28:29]
    v_mov_b32 v80, 0
    v_mov_b32 v81, v83
    s_mov_b64 exec, 3
    v_mov_b32 v74, v80
    v_mov_b32 v75, v81
    s_setpc_b64 s[60:61]
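    # ismulh_r_sub / imulh_r_sub below compute the high 64 bits of a 64x64-bit
    # product from 32-bit partial products, with ismulh_r_sub adding the usual
    # signed correction on top of the unsigned result. A hedged C sketch of
    # this kind of decomposition (helper names are illustrative):
    #
    #   #include <stdint.h>
    #
    #   static uint64_t mulhi_u64(uint64_t a, uint64_t b)
    #   {
    #       const uint64_t a_lo = (uint32_t)a, a_hi = a >> 32;
    #       const uint64_t b_lo = (uint32_t)b, b_hi = b >> 32;
    #       const uint64_t t    = a_hi * b_lo + ((a_lo * b_lo) >> 32);
    #       const uint64_t mid  = a_lo * b_hi + (uint32_t)t;
    #       return a_hi * b_hi + (t >> 32) + (mid >> 32);
    #   }
    #
    #   static int64_t mulhi_s64(int64_t a, int64_t b)
    #   {
    #       uint64_t hi = mulhi_u64((uint64_t)a, (uint64_t)b);
    #       if (a < 0) hi -= (uint64_t)b;   /* signed correction, as in ismulh_r_sub */
    #       if (b < 0) hi -= (uint64_t)a;
    #       return (int64_t)hi;
    #   }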
ismulh_r_sub:
    s_mov_b64 exec, 1
    v_mov_b32 v45, s14
    v_mul_hi_u32 v40, s38, v45
    v_mov_b32 v47, s15
    v_mad_u64_u32 v[42:43], s[32:33], s38, v47, v[40:41]
    v_mov_b32 v40, v42
    v_mad_u64_u32 v[45:46], s[32:33], s39, v45, v[40:41]
    v_mad_u64_u32 v[42:43], s[32:33], s39, v47, v[43:44]
    v_add_u32 v42, vcc, v42, v46
    v_addc_u32 v43, vcc, 0, v43, vcc
    v_readlane_b32 s32, v42, 0
    v_readlane_b32 s33, v43, 0
    s_cmp_lt_i32 s15, 0
    s_cselect_b64 s[34:35], s[38:39], 0
    s_sub_u32 s32, s32, s34
    s_subb_u32 s33, s33, s35
    s_cmp_lt_i32 s39, 0
    s_cselect_b64 s[34:35], s[14:15], 0
    s_sub_u32 s14, s32, s34
    s_subb_u32 s15, s33, s35
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]

imulh_r_sub:
    s_mov_b64 exec, 1
    v_mov_b32 v45, s38
    v_mul_hi_u32 v40, s14, v45
    v_mov_b32 v47, s39
    v_mad_u64_u32 v[42:43], s[32:33], s14, v47, v[40:41]
    v_mov_b32 v40, v42
    v_mad_u64_u32 v[45:46], s[32:33], s15, v45, v[40:41]
    v_mad_u64_u32 v[42:43], s[32:33], s15, v47, v[43:44]
    v_add_u32 v42, vcc, v42, v46
    v_addc_u32 v43, vcc, 0, v43, vcc
    v_readlane_b32 s14, v42, 0
    v_readlane_b32 s15, v43, 0
    s_mov_b64 exec, 3
    s_setpc_b64 s[60:61]