1602 lines
47 KiB
Common Lisp
1602 lines
47 KiB
Common Lisp
/*
|
|
Copyright (c) 2019 SChernykh
|
|
Portions Copyright (c) 2018-2019 tevador
|
|
|
|
This file is part of RandomX OpenCL.
|
|
|
|
RandomX OpenCL is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
RandomX OpenCL is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#define INITIAL_HASH_SIZE 64
|
|
#define INTERMEDIATE_PROGRAM_SIZE (RANDOMX_PROGRAM_SIZE * 16)
|
|
#define COMPILED_PROGRAM_SIZE 10048
|
|
#define NUM_VGPR_REGISTERS 128
|
|
|
|
#define mantissaSize 52
|
|
#define exponentSize 11
|
|
#define mantissaMask ((1UL << mantissaSize) - 1)
|
|
#define exponentMask ((1UL << exponentSize) - 1)
|
|
#define exponentBias 1023
|
|
|
|
#define dynamicExponentBits 4
|
|
#define staticExponentBits 4
|
|
#define constExponentBits 0x300
|
|
#define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1)
|
|
|
|
#define ScratchpadL1Mask_reg 38
|
|
#define ScratchpadL2Mask_reg 39
|
|
#define ScratchpadL3Mask_reg 50
|
|
|
|
#define ScratchpadL3Mask (RANDOMX_SCRATCHPAD_L3 - 8)
|
|
|
|
#define RANDOMX_JUMP_BITS 8
|
|
#define RANDOMX_JUMP_OFFSET 8
|
|
|
|
#if GCN_VERSION >= 15
|
|
|
|
#define S_SETPC_B64_S12_13 0xbe80200cu
|
|
#define V_AND_B32_CALC_ADDRESS 0x3638000eu
|
|
#define GLOBAL_LOAD_DWORDX2_SCRATCHPAD_LOAD 0xdc348000u
|
|
#define S_WAITCNT_SCRATCHPAD_LOAD2 0xbf8c3f70u
|
|
#define V_READLANE_B32_SCRATCHPAD_LOAD2 0xd7600000u
|
|
#define S_MUL_HI_U32_IMUL_R 0x9a8f1010u
|
|
#define S_MUL_I32_IMUL 0x93000000u
|
|
#define S_MUL_HI_U32_IMUL_R_2 0x9a8fff10u
|
|
#define S_MUL_HI_U32_IMUL_M 0x9aa10e10u
|
|
#define S_MOV_B32_IMUL_RCP 0xbea003ffu
|
|
#define S_MUL_HI_U32_IMUL_RCP 0x9a8f2010u
|
|
#define S_XOR_B32_64 0x89000000u
|
|
#define S_MOV_B32_XOR_R 0xbebe03ffu
|
|
#define S_LSHR 0x90000000u
|
|
#define S_LSHL 0x8f000000u
|
|
#define S_OR 0x88000000u
|
|
#define S_AND 0x87000000u
|
|
#define S_BFE 0x94000000u
|
|
#define DS_SWIZZLE_B32_FSWAP_R 0xd8d48001u
|
|
#define V_ADD_F64 0xd564003cu
|
|
#define V_AND_B32 0x36000000u
|
|
#define GLOBAL_LOAD_DWORD_SCRATCHPAD_LOAD_FP 0xdc308000u
|
|
#define V_XOR_B32 0x3a000000u
|
|
#define V_MUL_F64 0xd5650044u
|
|
|
|
#else
|
|
|
|
#define S_SETPC_B64_S12_13 0xbe801d0cu
|
|
#define V_AND_B32_CALC_ADDRESS 0x2638000eu
|
|
#define GLOBAL_LOAD_DWORDX2_SCRATCHPAD_LOAD 0xdc548000u
|
|
#define S_WAITCNT_SCRATCHPAD_LOAD2 0xbf8c0f70u
|
|
#define V_READLANE_B32_SCRATCHPAD_LOAD2 0xd2890000u
|
|
#define S_MUL_HI_U32_IMUL_R 0x960f1010u
|
|
#define S_MUL_I32_IMUL 0x92000000u
|
|
#define S_MUL_HI_U32_IMUL_R_2 0x960fff10u
|
|
#define S_MUL_HI_U32_IMUL_M 0x96210e10u
|
|
#define S_MOV_B32_IMUL_RCP 0xbea000ffu
|
|
#define S_MUL_HI_U32_IMUL_RCP 0x960f2010u
|
|
#define S_XOR_B32_64 0x88000000u
|
|
#define S_MOV_B32_XOR_R 0xbebe00ffu
|
|
#define S_LSHR 0x8f000000u
|
|
#define S_LSHL 0x8e000000u
|
|
#define S_OR 0x87000000u
|
|
#define S_AND 0x86000000u
|
|
#define S_BFE 0x93000000u
|
|
#define DS_SWIZZLE_B32_FSWAP_R 0xd87a8001u
|
|
#define V_ADD_F64 0xd280003cu
|
|
#define V_AND_B32 0x26000000u
|
|
#define GLOBAL_LOAD_DWORD_SCRATCHPAD_LOAD_FP 0xdc508000u
|
|
#define V_XOR_B32 0x2a000000u
|
|
#define V_MUL_F64 0xd2810044u
|
|
|
|
#endif
|
|
|
|
__global uint* jit_scratchpad_calc_address(__global uint* p, uint src, uint imm32, uint mask_reg, uint batch_size)
|
|
{
|
|
// s_add_i32 s14, s(16 + src * 2), imm32
|
|
*(p++) = 0x810eff10u | (src << 1);
|
|
*(p++) = imm32;
|
|
|
|
// v_and_b32 v28, s14, mask_reg
|
|
*(p++) = V_AND_B32_CALC_ADDRESS | (mask_reg << 9);
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_calc_fixed_address(__global uint* p, uint imm32, uint batch_size)
|
|
{
|
|
// v_mov_b32 v28, imm32
|
|
*(p++) = 0x7e3802ffu;
|
|
*(p++) = imm32;
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_load(__global uint* p, uint vgpr_index)
|
|
{
|
|
// v28 = offset
|
|
|
|
#if GCN_VERSION >= 14
|
|
// global_load_dwordx2 v[vgpr_index:vgpr_index+1], v28, s[0:1]
|
|
*(p++) = GLOBAL_LOAD_DWORDX2_SCRATCHPAD_LOAD;
|
|
*(p++) = 0x0000001cu | (vgpr_index << 24);
|
|
#else
|
|
*(p++) = 0x32543902u; // v_add_u32 v42, vcc, v2, v28
|
|
*(p++) = 0xd11c6a2bu; // v_addc_u32 v43, vcc, v3, 0, vcc
|
|
*(p++) = 0x01a90103u;
|
|
*(p++) = 0xdc540000u; // flat_load_dwordx2 v[vgpr_index:vgpr_index+1], v[42:43]
|
|
*(p++) = 0x0000002au | (vgpr_index << 24);
|
|
#endif
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_load2(__global uint* p, uint vgpr_index, int vmcnt)
|
|
{
|
|
// s_waitcnt vmcnt(N)
|
|
if (vmcnt >= 0)
|
|
*(p++) = S_WAITCNT_SCRATCHPAD_LOAD2 | (vmcnt & 15) | ((vmcnt >> 4) << 14);
|
|
|
|
// v_readlane_b32 s14, vgpr_index, 0
|
|
*(p++) = V_READLANE_B32_SCRATCHPAD_LOAD2 | 14;
|
|
*(p++) = 0x00010100u | vgpr_index;
|
|
|
|
// v_readlane_b32 s15, vgpr_index + 1, 0
|
|
*(p++) = V_READLANE_B32_SCRATCHPAD_LOAD2 | 15;
|
|
*(p++) = 0x00010100u | (vgpr_index + 1);
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_calc_address_fp(__global uint* p, uint src, uint imm32, uint mask_reg, uint batch_size)
|
|
{
|
|
// s_add_i32 s14, s(16 + src * 2), imm32
|
|
*(p++) = 0x810eff10u | (src << 1);
|
|
*(p++) = imm32;
|
|
|
|
// v_and_b32 v28, s14, mask_reg
|
|
*(p++) = V_AND_B32 | 0x38000eu | (mask_reg << 9);
|
|
|
|
#if GCN_VERSION >= 15
|
|
// v_add_nc_u32 v28, v28, v44
|
|
*(p++) = 0x4a38591cu;
|
|
#elif GCN_VERSION == 14
|
|
// v_add_u32 v28, v28, v44
|
|
*(p++) = 0x6838591cu;
|
|
#else
|
|
// v_add_u32 v28, vcc, v28, v44
|
|
*(p++) = 0x3238591cu;
|
|
#endif
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_load_fp(__global uint* p, uint vgpr_index)
|
|
{
|
|
// v28 = offset
|
|
|
|
#if GCN_VERSION >= 14
|
|
// global_load_dword v(vgpr_index), v28, s[0:1]
|
|
*(p++) = GLOBAL_LOAD_DWORD_SCRATCHPAD_LOAD_FP;
|
|
*(p++) = 0x0000001cu | (vgpr_index << 24);
|
|
#else
|
|
*(p++) = 0x32543902u; // v_add_u32 v42, vcc, v2, v28
|
|
*(p++) = 0xd11c6a2bu; // v_addc_u32 v43, vcc, v3, 0, vcc
|
|
*(p++) = 0x01a90103u;
|
|
*(p++) = 0xdc500000u; // flat_load_dword v(vgpr_index), v[42:43]
|
|
*(p++) = 0x0000002au | (vgpr_index << 24);
|
|
#endif
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_scratchpad_load2_fp(__global uint* p, uint vgpr_index, int vmcnt)
|
|
{
|
|
// s_waitcnt vmcnt(N)
|
|
if (vmcnt >= 0)
|
|
*(p++) = S_WAITCNT_SCRATCHPAD_LOAD2 | (vmcnt & 15) | ((vmcnt >> 4) << 14);
|
|
|
|
// v_cvt_f64_i32 v[28:29], vgpr_index
|
|
*(p++) = 0x7e380900u | vgpr_index;
|
|
|
|
return p;
|
|
}
|
|
|
|
__global uint* jit_emit_instruction(__global uint* p, __global uint* last_branch_target, const uint2 inst, int prefetch_vgpr_index, int vmcnt, uint batch_size)
|
|
{
|
|
uint opcode = inst.x & 0xFF;
|
|
const uint dst = (inst.x >> 8) & 7;
|
|
const uint src = (inst.x >> 16) & 7;
|
|
const uint mod = inst.x >> 24;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_RS)
|
|
{
|
|
const uint shift = (mod >> 2) % 4;
|
|
if (shift > 0) // p = 3/4
|
|
{
|
|
// s_lshl_b64 s[14:15], s[(16 + src * 2):(17 + src * 2)], shift
|
|
*(p++) = S_LSHL | 0x8e8010u | (src << 1) | (shift << 8);
|
|
|
|
// s_add_u32 s(16 + dst * 2), s(16 + dst * 2), s14
|
|
*(p++) = 0x80100e10u | (dst << 1) | (dst << 17);
|
|
|
|
// s_addc_u32 s(17 + dst * 2), s(17 + dst * 2), s15
|
|
*(p++) = 0x82110f11u | (dst << 1) | (dst << 17);
|
|
}
|
|
else // p = 1/4
|
|
{
|
|
// s_add_u32 s(16 + dst * 2), s(16 + dst * 2), s(16 + src * 2)
|
|
*(p++) = 0x80101010u | (dst << 1) | (dst << 17) | (src << 9);
|
|
|
|
// s_addc_u32 s(17 + dst * 2), s(17 + dst * 2), s(17 + src * 2)
|
|
*(p++) = 0x82111111u | (dst << 1) | (dst << 17) | (src << 9);
|
|
}
|
|
|
|
if (dst == 5) // p = 1/8
|
|
{
|
|
// s_add_u32 s(16 + dst * 2), s(16 + dst * 2), imm32
|
|
*(p++) = 0x8010ff10u | (dst << 1) | (dst << 17);
|
|
*(p++) = inst.y;
|
|
|
|
// s_addc_u32 s(17 + dst * 2), s(17 + dst * 2), ((inst.y < 0) ? -1 : 0)
|
|
*(p++) = 0x82110011u | (dst << 1) | (dst << 17) | (((as_int(inst.y) < 0) ? 0xc1 : 0x80) << 8);
|
|
}
|
|
|
|
// 12*3/4 + 8*1/4 + 12/8 = 12.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_RS;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// s_add_u32 s(16 + dst * 2), s(16 + dst * 2), s14
|
|
*(p++) = 0x80100e10u | (dst << 1) | (dst << 17);
|
|
|
|
// s_addc_u32 s(17 + dst * 2), s(17 + dst * 2), s15
|
|
*(p++) = 0x82110f11u | (dst << 1) | (dst << 17);
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 8 = 47.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_R)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
{
|
|
// s_sub_u32 s(16 + dst * 2), s(16 + dst * 2), s(16 + src * 2)
|
|
*(p++) = 0x80901010u | (dst << 1) | (dst << 17) | (src << 9);
|
|
|
|
// s_subb_u32 s(17 + dst * 2), s(17 + dst * 2), s(17 + src * 2)
|
|
*(p++) = 0x82911111u | (dst << 1) | (dst << 17) | (src << 9);
|
|
}
|
|
else // p = 1/8
|
|
{
|
|
// s_sub_u32 s(16 + dst * 2), s(16 + dst * 2), imm32
|
|
*(p++) = 0x8090ff10u | (dst << 1) | (dst << 17);
|
|
*(p++) = inst.y;
|
|
|
|
// s_subb_u32 s(17 + dst * 2), s(17 + dst * 2), ((inst.y < 0) ? -1 : 0)
|
|
*(p++) = 0x82910011u | (dst << 1) | (dst << 17) | (((as_int(inst.y) < 0) ? 0xc1 : 0x80) << 8);
|
|
}
|
|
|
|
// 8*7/8 + 12/8 = 8.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// s_sub_u32 s(16 + dst * 2), s(16 + dst * 2), s14
|
|
*(p++) = 0x80900e10u | (dst << 1) | (dst << 17);
|
|
|
|
// s_subb_u32 s(17 + dst * 2), s(17 + dst * 2), s15
|
|
*(p++) = 0x82910f11u | (dst << 1) | (dst << 17);
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 8 = 47.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_R)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
{
|
|
#if GCN_VERSION >= 14
|
|
// s_mul_hi_u32 s15, s(16 + dst * 2), s(16 + src * 2)
|
|
*(p++) = S_MUL_HI_U32_IMUL_R | (dst << 1) | (src << 9);
|
|
#else
|
|
// v_mov_b32 v28, s(16 + dst * 2)
|
|
*(p++) = 0x7e380210u | (dst << 1);
|
|
// v_mul_hi_u32 v28, v28, s(16 + src * 2)
|
|
*(p++) = 0xd286001cu;
|
|
*(p++) = 0x0000211cu + (src << 10);
|
|
// v_readlane_b32 s15, v28, 0
|
|
*(p++) = 0xd289000fu;
|
|
*(p++) = 0x0001011cu;
|
|
#endif
|
|
|
|
// s_mul_i32 s14, s(16 + dst * 2), s(17 + src * 2)
|
|
*(p++) = S_MUL_I32_IMUL | 0x0e1110u | (dst << 1) | (src << 9);
|
|
|
|
// s_add_u32 s15, s15, s14
|
|
*(p++) = 0x800f0e0fu;
|
|
|
|
// s_mul_i32 s14, s(17 + dst * 2), s(16 + src * 2)
|
|
*(p++) = S_MUL_I32_IMUL | 0x0e1011u | (dst << 1) | (src << 9);
|
|
|
|
// s_add_u32 s(17 + dst * 2), s15, s14
|
|
*(p++) = 0x80110e0fu | (dst << 17);
|
|
|
|
// s_mul_i32 s(16 + dst * 2), s(16 + dst * 2), s(16 + src * 2)
|
|
*(p++) = S_MUL_I32_IMUL | 0x101010u | (dst << 1) | (dst << 17) | (src << 9);
|
|
}
|
|
else // p = 1/8
|
|
{
|
|
#if GCN_VERSION >= 14
|
|
// s_mul_hi_u32 s15, s(16 + dst * 2), imm32
|
|
*(p++) = S_MUL_HI_U32_IMUL_R_2 | (dst << 1);
|
|
*(p++) = inst.y;
|
|
#else
|
|
// v_mov_b32 v28, imm32
|
|
*(p++) = 0x7e3802ffu;
|
|
*(p++) = inst.y;
|
|
// v_mul_hi_u32 v28, v28, s(16 + dst * 2)
|
|
*(p++) = 0xd286001cu;
|
|
*(p++) = 0x0000211cu + (dst << 10);
|
|
// v_readlane_b32 s15, v28, 0
|
|
*(p++) = 0xd289000fu;
|
|
*(p++) = 0x0001011cu;
|
|
#endif
|
|
|
|
if (as_int(inst.y) < 0) // p = 1/2
|
|
{
|
|
// s_sub_u32 s15, s15, s(16 + dst * 2)
|
|
*(p++) = 0x808f100fu | (dst << 9);
|
|
}
|
|
|
|
// s_mul_i32 s14, s(17 + dst * 2), imm32
|
|
*(p++) = S_MUL_I32_IMUL | 0x0eff11u | (dst << 1);
|
|
*(p++) = inst.y;
|
|
|
|
// s_add_u32 s(17 + dst * 2), s15, s14
|
|
*(p++) = 0x80110e0fu | (dst << 17);
|
|
|
|
// s_mul_i32 s(16 + dst * 2), s(16 + dst * 2), imm32
|
|
*(p++) = S_MUL_I32_IMUL | 0x10ff10u | (dst << 1) | (dst << 17);
|
|
*(p++) = inst.y;
|
|
}
|
|
|
|
// 24*7/8 + 28*1/8 + 4*1/16 = 24.75 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
#if GCN_VERSION >= 14
|
|
// s_mul_hi_u32 s33, s(16 + dst * 2), s14
|
|
*(p++) = S_MUL_HI_U32_IMUL_M | (dst << 1);
|
|
#else
|
|
// v_mov_b32 v28, s(16 + dst * 2)
|
|
*(p++) = 0x7e380210u | (dst << 1);
|
|
// v_mul_hi_u32 v28, v28, s14
|
|
*(p++) = 0xd286001cu;
|
|
*(p++) = 0x00001d1cu;
|
|
// v_readlane_b32 s33, v28, 0
|
|
*(p++) = 0xd2890021u;
|
|
*(p++) = 0x0001011cu;
|
|
#endif
|
|
|
|
// s_mul_i32 s32, s(16 + dst * 2), s15
|
|
*(p++) = S_MUL_I32_IMUL | 0x200f10u | (dst << 1);
|
|
|
|
// s_add_u32 s33, s33, s32
|
|
*(p++) = 0x80212021u;
|
|
|
|
// s_mul_i32 s32, s(17 + dst * 2), s14
|
|
*(p++) = S_MUL_I32_IMUL | 0x200e11u | (dst << 1);
|
|
|
|
// s_add_u32 s(17 + dst * 2), s33, s32
|
|
*(p++) = 0x80112021u | (dst << 17);
|
|
|
|
// s_mul_i32 s(16 + dst * 2), s(16 + dst * 2), s14
|
|
*(p++) = S_MUL_I32_IMUL | 0x100e10u | (dst << 1) | (dst << 17);
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 24 = 63.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_R)
|
|
{
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbe8e0410u | (dst << 1); // s_mov_b64 s[14:15], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbea60410u | (src << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc213au; // s_swappc_b64 s[60:61], s[58:59]
|
|
*(p++) = 0xbe90040eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#else
|
|
*(p++) = 0xbe8e0110u | (dst << 1); // s_mov_b64 s[14:15], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbea60110u | (src << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc1e3au; // s_swappc_b64 s[60:61], s[58:59]
|
|
*(p++) = 0xbe90010eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#endif
|
|
|
|
// 16 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbea60410u | (dst << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc213au; // s_swappc_b64 s[60:61], s[58:59]
|
|
*(p++) = 0xbe90040eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#else
|
|
*(p++) = 0xbea60110u | (dst << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc1e3au; // s_swappc_b64 s[60:61], s[58:59]
|
|
*(p++) = 0xbe90010eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#endif
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 12 = 51.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_R)
|
|
{
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbe8e0410u | (dst << 1); // s_mov_b64 s[14:15], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbea60410u | (src << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc2138u; // s_swappc_b64 s[60:61], s[56:57]
|
|
*(p++) = 0xbe90040eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#else
|
|
*(p++) = 0xbe8e0110u | (dst << 1); // s_mov_b64 s[14:15], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbea60110u | (src << 1); // s_mov_b64 s[38:39], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbebc1e38u; // s_swappc_b64 s[60:61], s[56:57]
|
|
*(p++) = 0xbe90010eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#endif
|
|
|
|
// 16 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbea60410u | (dst << 1); // s_mov_b64 s[38:39], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbebc2138u; // s_swappc_b64 s[60:61], s[56:57]
|
|
*(p++) = 0xbe90040eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#else
|
|
*(p++) = 0xbea60110u | (dst << 1); // s_mov_b64 s[38:39], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbebc1e38u; // s_swappc_b64 s[60:61], s[56:57]
|
|
*(p++) = 0xbe90010eu | (dst << 17); // s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
#endif
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 12 = 51.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_RCP)
|
|
{
|
|
if (inst.y & (inst.y - 1))
|
|
{
|
|
const uint2 rcp_value = as_uint2(imul_rcp_value(inst.y));
|
|
|
|
*(p++) = S_MOV_B32_IMUL_RCP; // s_mov_b32 s32, imm32
|
|
*(p++) = rcp_value.x;
|
|
#if GCN_VERSION >= 14
|
|
*(p++) = S_MUL_HI_U32_IMUL_RCP | (dst << 1); // s_mul_hi_u32 s15, s(16 + dst * 2), s32
|
|
#else
|
|
// v_mov_b32 v28, s32
|
|
*(p++) = 0x7e380220u;
|
|
// v_mul_hi_u32 v28, v28, s(16 + dst * 2)
|
|
*(p++) = 0xd286001cu;
|
|
*(p++) = 0x0000211cu + (dst << 10);
|
|
// v_readlane_b32 s15, v28, 0
|
|
*(p++) = 0xd289000fu;
|
|
*(p++) = 0x0001011cu;
|
|
#endif
|
|
*(p++) = S_MUL_I32_IMUL | 0x0eff10u | (dst << 1); // s_mul_i32 s14, s(16 + dst * 2), imm32
|
|
*(p++) = rcp_value.y;
|
|
*(p++) = 0x800f0e0fu; // s_add_u32 s15, s15, s14
|
|
*(p++) = S_MUL_I32_IMUL | 0x0e2011u | (dst << 1); // s_mul_i32 s14, s(17 + dst * 2), s32
|
|
*(p++) = 0x80110e0fu | (dst << 17); // s_add_u32 s(17 + dst * 2), s15, s14
|
|
*(p++) = S_MUL_I32_IMUL | 0x102010u | (dst << 1) | (dst << 17);// s_mul_i32 s(16 + dst * 2), s(16 + dst * 2), s32
|
|
}
|
|
|
|
// 36 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_RCP;
|
|
|
|
if (opcode < RANDOMX_FREQ_INEG_R)
|
|
{
|
|
*(p++) = 0x80901080u | (dst << 9) | (dst << 17); // s_sub_u32 s(16 + dst * 2), 0, s(16 + dst * 2)
|
|
*(p++) = 0x82911180u | (dst << 9) | (dst << 17); // s_subb_u32 s(17 + dst * 2), 0, s(17 + dst * 2)
|
|
|
|
// 8 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_INEG_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_R)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
{
|
|
// s_xor_b64 s[16 + dst * 2:17 + dst * 2], s[16 + dst * 2:17 + dst * 2], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = S_XOR_B32_64 | 0x901010u | (dst << 1) | (dst << 17) | (src << 9);
|
|
}
|
|
else // p = 1/8
|
|
{
|
|
if (as_int(inst.y) < 0) // p = 1/2
|
|
{
|
|
// s_mov_b32 s62, imm32
|
|
*(p++) = S_MOV_B32_XOR_R;
|
|
*(p++) = inst.y;
|
|
|
|
// s_xor_b64 s[16 + dst * 2:17 + dst * 2], s[16 + dst * 2:17 + dst * 2], s[62:63]
|
|
*(p++) = S_XOR_B32_64 | 0x903e10u | (dst << 1) | (dst << 17);
|
|
}
|
|
else
|
|
{
|
|
// s_xor_b32 s(16 + dst * 2), s(16 + dst * 2), imm32
|
|
*(p++) = S_XOR_B32_64 | 0x10ff10u | (dst << 1) | (dst << 17);
|
|
*(p++) = inst.y;
|
|
}
|
|
}
|
|
|
|
// 4*7/8 + 12/16 + 8/16 = 4.75 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
p = jit_scratchpad_calc_address(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
else // p = 1/8
|
|
p = jit_scratchpad_calc_fixed_address(p, inst.y & ScratchpadL3Mask, batch_size);
|
|
|
|
p = jit_scratchpad_load(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// s_xor_b64 s[16 + dst * 2:17 + dst * 2], s[16 + dst * 2:17 + dst * 2], s[14:15]
|
|
*(p++) = S_XOR_B32_64 | 0x900e10u | (dst << 1) | (dst << 17);
|
|
}
|
|
|
|
// (12*7/8 + 8*1/8 + 28) + 4 = 43.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
|
|
{
|
|
if (src != dst) // p = 7/8
|
|
{
|
|
if (opcode < RANDOMX_FREQ_IROR_R)
|
|
{
|
|
// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
|
|
*(p++) = S_LSHR | 0xa01010u | (dst << 1) | (src << 9);
|
|
|
|
// s_sub_u32 s15, 64, s(16 + src * 2)
|
|
*(p++) = 0x808f10c0u | (src << 9);
|
|
|
|
// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
|
|
*(p++) = S_LSHL | 0xa20f10u | (dst << 1);
|
|
}
|
|
else
|
|
{
|
|
// s_lshl_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], s(16 + src * 2)
|
|
*(p++) = S_LSHL | 0xa01010u | (dst << 1) | (src << 9);
|
|
|
|
// s_sub_u32 s15, 64, s(16 + src * 2)
|
|
*(p++) = 0x808f10c0u | (src << 9);
|
|
|
|
// s_lshr_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], s15
|
|
*(p++) = S_LSHR | 0xa20f10u | (dst << 1);
|
|
}
|
|
}
|
|
else // p = 1/8
|
|
{
|
|
const uint shift = ((opcode < RANDOMX_FREQ_IROR_R) ? inst.y : -inst.y) & 63;
|
|
|
|
// s_lshr_b64 s[32:33], s[16 + dst * 2:17 + dst * 2], shift
|
|
*(p++) = S_LSHR | 0xa08010u | (dst << 1) | (shift << 8);
|
|
|
|
// s_lshl_b64 s[34:35], s[16 + dst * 2:17 + dst * 2], 64 - shift
|
|
*(p++) = S_LSHL | 0xa28010u | (dst << 1) | ((64 - shift) << 8);
|
|
}
|
|
|
|
// s_or_b64 s[16 + dst * 2:17 + dst * 2], s[32:33], s[34:35]
|
|
*(p++) = S_OR | 0x902220u | (dst << 17);
|
|
|
|
// 12*7/8 + 8/8 + 4 = 15.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISWAP_R)
|
|
{
|
|
if (src != dst)
|
|
{
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbea00410u | (dst << 1); // s_mov_b64 s[32:33], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbe900410u | (src << 1) | (dst << 17);// s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbe900420u | (src << 17); // s_mov_b64 s[16 + src * 2:17 + Src * 2], s[32:33]
|
|
#else
|
|
*(p++) = 0xbea00110u | (dst << 1); // s_mov_b64 s[32:33], s[16 + dst * 2:17 + dst * 2]
|
|
*(p++) = 0xbe900110u | (src << 1) | (dst << 17);// s_mov_b64 s[16 + dst * 2:17 + dst * 2], s[16 + src * 2:17 + src * 2]
|
|
*(p++) = 0xbe900120u | (src << 17); // s_mov_b64 s[16 + src * 2:17 + Src * 2], s[32:33]
|
|
#endif
|
|
}
|
|
|
|
// 12*7/8 = 10.5 bytes on average
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSWAP_R)
|
|
{
|
|
// ds_swizzle_b32 v(60 + dst * 2), v(60 + dst * 2) offset:0x8001
|
|
*(p++) = DS_SWIZZLE_B32_FSWAP_R;
|
|
*(p++) = 0x3c00003cu + (dst << 1) + (dst << 25);
|
|
|
|
// ds_swizzle_b32 v(61 + dst * 2), v(61 + dst * 2) offset:0x8001
|
|
*(p++) = DS_SWIZZLE_B32_FSWAP_R;
|
|
*(p++) = 0x3d00003du + (dst << 1) + (dst << 25);
|
|
|
|
// s_waitcnt lgkmcnt(0)
|
|
*(p++) = 0xbf8cc07fu;
|
|
|
|
// 20 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_R)
|
|
{
|
|
// v_add_f64 v[60 + dst * 2:61 + dst * 2], v[60 + dst * 2:61 + dst * 2], v[52 + src * 2:53 + src * 2]
|
|
*(p++) = V_ADD_F64 + ((dst & 3) << 1);
|
|
*(p++) = 0x0002693cu + ((dst & 3) << 1) + ((src & 3) << 10);
|
|
|
|
// 8 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
p = jit_scratchpad_calc_address_fp(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
p = jit_scratchpad_load_fp(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2_fp(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// v_add_f64 v[60 + dst * 2:61 + dst * 2], v[60 + dst * 2:61 + dst * 2], v[28:29]
|
|
*(p++) = V_ADD_F64 + ((dst & 3) << 1);
|
|
*(p++) = 0x0002393cu + ((dst & 3) << 1);
|
|
}
|
|
|
|
// 32 + 8 = 40 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_R)
|
|
{
|
|
// v_add_f64 v[60 + dst * 2:61 + dst * 2], v[60 + dst * 2:61 + dst * 2], -v[52 + src * 2:53 + src * 2]
|
|
*(p++) = V_ADD_F64 + ((dst & 3) << 1);
|
|
*(p++) = 0x4002693cu + ((dst & 3) << 1) + ((src & 3) << 10);
|
|
|
|
// 8 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
p = jit_scratchpad_calc_address_fp(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
p = jit_scratchpad_load_fp(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2_fp(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// v_add_f64 v[60 + dst * 2:61 + dst * 2], v[60 + dst * 2:61 + dst * 2], -v[28:29]
|
|
*(p++) = V_ADD_F64 + ((dst & 3) << 1);
|
|
*(p++) = 0x4002393cu + ((dst & 3) << 1);
|
|
}
|
|
|
|
// 32 + 8 = 40 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSCAL_R)
|
|
{
|
|
// v_xor_b32 v(61 + dst * 2), v(61 + dst * 2), v51
|
|
*(p++) = (V_XOR_B32 | 0x7a673du) + ((dst & 3) << 1) + ((dst & 3) << 18);
|
|
|
|
// 4 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSCAL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FMUL_R)
|
|
{
|
|
// v_mul_f64 v[68 + dst * 2:69 + dst * 2], v[68 + dst * 2:69 + dst * 2], v[52 + src * 2:53 + src * 2]
|
|
*(p++) = V_MUL_F64 + ((dst & 3) << 1);
|
|
*(p++) = 0x00026944u + ((dst & 3) << 1) + ((src & 3) << 10);
|
|
|
|
// 8 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FDIV_M)
|
|
{
|
|
if (prefetch_vgpr_index >= 0)
|
|
{
|
|
p = jit_scratchpad_calc_address_fp(p, src, inst.y, (mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg, batch_size);
|
|
p = jit_scratchpad_load_fp(p, prefetch_vgpr_index ? prefetch_vgpr_index : 28);
|
|
}
|
|
|
|
if (prefetch_vgpr_index <= 0)
|
|
{
|
|
p = jit_scratchpad_load2_fp(p, prefetch_vgpr_index ? -prefetch_vgpr_index : 28, prefetch_vgpr_index ? vmcnt : 0);
|
|
|
|
// s_swappc_b64 s[60:61], s[48 + dst * 2:49 + dst * 2]
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbebc2130u + ((dst & 3) << 1);
|
|
#else
|
|
*(p++) = 0xbebc1e30u + ((dst & 3) << 1);
|
|
#endif
|
|
}
|
|
|
|
// 32 + 4 = 36 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FDIV_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSQRT_R)
|
|
{
|
|
// s_swappc_b64 s[60:61], s[40 + dst * 2:41 + dst * 2]
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbebc2128u + ((dst & 3) << 1);
|
|
#else
|
|
*(p++) = 0xbebc1e28u + ((dst & 3) << 1);
|
|
#endif
|
|
|
|
// 4 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSQRT_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_CBRANCH)
|
|
{
|
|
const int shift = (mod >> 4) + RANDOMX_JUMP_OFFSET;
|
|
uint imm = inst.y | (1u << shift);
|
|
imm &= ~(1u << (shift - 1));
|
|
|
|
// s_add_u32 s(16 + dst * 2), s(16 + dst * 2), imm32
|
|
*(p++) = 0x8010ff10 | (dst << 1) | (dst << 17);
|
|
*(p++) = imm;
|
|
|
|
// s_addc_u32 s(17 + dst * 2), s(17 + dst * 2), ((imm < 0) ? -1 : 0)
|
|
*(p++) = 0x82110011u | (dst << 1) | (dst << 17) | (((as_int(imm) < 0) ? 0xc1 : 0x80) << 8);
|
|
|
|
const uint conditionMaskReg = 70 + (mod >> 4);
|
|
|
|
// s_and_b32 s14, s(16 + dst * 2), conditionMaskReg
|
|
*(p++) = S_AND | 0x0e0010u | (dst << 1) | (conditionMaskReg << 8);
|
|
|
|
// s_cbranch_scc0 target
|
|
const int delta = ((last_branch_target - p) - 1);
|
|
*(p++) = 0xbf840000u | (delta & 0xFFFF);
|
|
|
|
// 20 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CBRANCH;
|
|
|
|
if (opcode < RANDOMX_FREQ_CFROUND)
|
|
{
|
|
const uint shift = inst.y & 63;
|
|
if (shift == 63)
|
|
{
|
|
*(p++) = S_LSHL | 0x0e8110u | (src << 1); // s_lshl_b32 s14, s(16 + src * 2), 1
|
|
*(p++) = S_LSHR | 0x0f9f11u | (src << 1); // s_lshr_b32 s15, s(17 + src * 2), 31
|
|
*(p++) = S_OR | 0x0e0f0eu; // s_or_b32 s14, s14, s15
|
|
*(p++) = S_AND | 0x0e830eu; // s_and_b32 s14, s14, 3
|
|
}
|
|
else
|
|
{
|
|
// s_bfe_u64 s[14:15], s[16:17], (shift,width=2)
|
|
*(p++) = S_BFE | 0x8eff10u | (src << 1);
|
|
*(p++) = shift | (2 << 16);
|
|
}
|
|
|
|
// s_brev_b32 s14, s14
|
|
// s_lshr_b32 s66, s14, 30
|
|
// s_setreg_b32 hwreg(mode, 2, 2), s66
|
|
#if GCN_VERSION >= 15
|
|
*(p++) = 0xbe8e0b0eu;
|
|
*(p++) = 0x90429e0eu;
|
|
*(p++) = 0xb9c20881u;
|
|
#else
|
|
*(p++) = 0xbe8e080eu;
|
|
*(p++) = 0x8f429e0eu;
|
|
*(p++) = 0xb9420881u;
|
|
#endif
|
|
|
|
// 20 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CFROUND;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISTORE)
|
|
{
|
|
const uint mask = ((mod >> 4) < 14) ? ((mod % 4) ? ScratchpadL1Mask_reg : ScratchpadL2Mask_reg) : ScratchpadL3Mask_reg;
|
|
p = jit_scratchpad_calc_address(p, dst, inst.y, mask, batch_size);
|
|
|
|
const uint vgpr_id = 48;
|
|
*(p++) = 0x7e000210u | (src << 1) | (vgpr_id << 17); // v_mov_b32 vgpr_id, s(16 + src * 2)
|
|
*(p++) = 0x7e020211u | (src << 1) | (vgpr_id << 17); // v_mov_b32 vgpr_id + 1, s(17 + src * 2)
|
|
|
|
// v28 = offset
|
|
|
|
#if GCN_VERSION >= 14
|
|
#if GCN_VERSION >= 15
|
|
// s_waitcnt vmcnt(0)
|
|
*(p++) = 0xbf8c3f70u;
|
|
#endif
|
|
// global_store_dwordx2 v28, v[vgpr_id:vgpr_id + 1], s[0:1]
|
|
*(p++) = 0xdc748000u;
|
|
*(p++) = 0x0000001cu | (vgpr_id << 8);
|
|
#else
|
|
// v_add_u32 v28, vcc, v28, v2
|
|
*(p++) = 0x3238051cu;
|
|
// v_addc_u32 v29, vcc, 0, v3, vcc
|
|
*(p++) = 0x383a0680u;
|
|
// flat_store_dwordx2 v[28:29], v[vgpr_id:vgpr_id + 1]
|
|
*(p++) = 0xdc740000u;
|
|
*(p++) = 0x0000001cu | (vgpr_id << 8);
|
|
#endif
|
|
|
|
// 28 bytes
|
|
return p;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISTORE;
|
|
|
|
return p;
|
|
}
|
|
|
|
int jit_prefetch_read(
|
|
__global uint2* p0,
|
|
const int prefetch_data_count,
|
|
const uint i,
|
|
const uint src,
|
|
const uint dst,
|
|
const uint2 inst,
|
|
const uint srcAvailableAt,
|
|
const uint scratchpadAvailableAt,
|
|
const uint scratchpadHighAvailableAt,
|
|
const int lastBranchTarget,
|
|
const int lastBranch)
|
|
{
|
|
uint2 t;
|
|
t.x = (src == dst) ? (((inst.y & ScratchpadL3Mask) >= RANDOMX_SCRATCHPAD_L2) ? scratchpadHighAvailableAt : scratchpadAvailableAt) : max(scratchpadAvailableAt, srcAvailableAt);
|
|
t.y = i;
|
|
|
|
const int t1 = t.x;
|
|
|
|
if ((lastBranchTarget <= t1) && (t1 <= lastBranch))
|
|
{
|
|
// Don't move prefetch inside previous branch scope
|
|
t.x = lastBranch + 1;
|
|
}
|
|
else if ((lastBranchTarget > lastBranch) && (t1 < lastBranchTarget))
|
|
{
|
|
// Don't move prefetch outside current branch scope
|
|
t.x = lastBranchTarget;
|
|
}
|
|
|
|
p0[prefetch_data_count] = t;
|
|
return prefetch_data_count + 1;
|
|
}
|
|
|
|
__global uint* generate_jit_code(__global uint2* e, __global uint2* p0, __global uint* p, uint batch_size)
|
|
{
|
|
int prefetch_data_count;
|
|
|
|
#pragma unroll 1
|
|
for (volatile int pass = 0; pass < 2; ++pass)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
int registerLastChanged[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
|
|
#else
|
|
ulong registerLastChanged = 0;
|
|
uint registerWasChanged = 0;
|
|
#endif
|
|
|
|
uint scratchpadAvailableAt = 0;
|
|
uint scratchpadHighAvailableAt = 0;
|
|
|
|
int lastBranchTarget = -1;
|
|
int lastBranch = -1;
|
|
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
int registerLastChangedAtBranchTarget[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
|
|
#else
|
|
ulong registerLastChangedAtBranchTarget = 0;
|
|
uint registerWasChangedAtBranchTarget = 0;
|
|
#endif
|
|
uint scratchpadAvailableAtBranchTarget = 0;
|
|
uint scratchpadHighAvailableAtBranchTarget = 0;
|
|
|
|
prefetch_data_count = 0;
|
|
|
|
#pragma unroll 1
|
|
for (uint i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
|
|
{
|
|
// Clean flags
|
|
if (pass == 0)
|
|
e[i].x &= ~(0xf8u << 8);
|
|
|
|
uint2 inst = e[i];
|
|
uint opcode = inst.x & 0xFF;
|
|
const uint dst = (inst.x >> 8) & 7;
|
|
const uint src = (inst.x >> 16) & 7;
|
|
const uint mod = inst.x >> 24;
|
|
|
|
if (pass == 1)
|
|
{
|
|
// Branch target
|
|
if (inst.x & (0x20 << 8))
|
|
{
|
|
lastBranchTarget = i;
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
#pragma unroll
|
|
for (int j = 0; j < 8; ++j)
|
|
registerLastChangedAtBranchTarget[j] = registerLastChanged[j];
|
|
#else
|
|
registerLastChangedAtBranchTarget = registerLastChanged;
|
|
registerWasChangedAtBranchTarget = registerWasChanged;
|
|
#endif
|
|
scratchpadAvailableAtBranchTarget = scratchpadAvailableAt;
|
|
scratchpadHighAvailableAtBranchTarget = scratchpadHighAvailableAt;
|
|
}
|
|
|
|
// Branch
|
|
if (inst.x & (0x40 << 8))
|
|
lastBranch = i;
|
|
}
|
|
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
const uint srcAvailableAt = registerLastChanged[src] + 1;
|
|
const uint dstAvailableAt = registerLastChanged[dst] + 1;
|
|
#else
|
|
const uint srcAvailableAt = (registerWasChanged & (1u << src)) ? (((registerLastChanged >> (src * 8)) & 0xFF) + 1) : 0;
|
|
const uint dstAvailableAt = (registerWasChanged & (1u << dst)) ? (((registerLastChanged >> (dst * 8)) & 0xFF) + 1) : 0;
|
|
#endif
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_RS)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_RS;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_RCP)
|
|
{
|
|
if (inst.y & (inst.y - 1))
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_RCP;
|
|
|
|
if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, dst, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISWAP_R)
|
|
{
|
|
if (src != dst)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
registerLastChanged[src] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (src * 8))) | ((ulong)(i) << (src * 8));
|
|
registerWasChanged |= (1u << dst) | (1u << src);
|
|
#endif
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R)
|
|
{
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_M)
|
|
{
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, 0xFF, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_R)
|
|
{
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_M)
|
|
{
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, 0xFF, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R)
|
|
{
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FDIV_M)
|
|
{
|
|
if (pass == 1)
|
|
prefetch_data_count = jit_prefetch_read(p0, prefetch_data_count, i, src, 0xFF, inst, srcAvailableAt, scratchpadAvailableAt, scratchpadHighAvailableAt, lastBranchTarget, lastBranch);
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FDIV_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSQRT_R)
|
|
{
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSQRT_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_CBRANCH)
|
|
{
|
|
if (pass == 0)
|
|
{
|
|
// Workaround for a bug in AMD 18.6.1 driver
|
|
volatile uint dstAvailableAt2 = dstAvailableAt;
|
|
|
|
// Mark branch target
|
|
e[dstAvailableAt2].x |= (0x20 << 8);
|
|
|
|
// Mark branch
|
|
e[i].x |= (0x40 << 8);
|
|
|
|
// Set all registers as changed at this instruction as per RandomX specification
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
#pragma unroll
|
|
for (int j = 0; j < 8; ++j)
|
|
registerLastChanged[j] = i;
|
|
#else
|
|
uint t = i | (i << 8);
|
|
t = t | (t << 16);
|
|
registerLastChanged = t;
|
|
registerLastChanged = registerLastChanged | (registerLastChanged << 32);
|
|
registerWasChanged = 0xFF;
|
|
#endif
|
|
}
|
|
else
|
|
{
|
|
// Update only registers which really changed inside this branch
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
registerLastChanged[dst] = i;
|
|
#else
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (dst * 8))) | ((ulong)(i) << (dst * 8));
|
|
registerWasChanged |= 1u << dst;
|
|
#endif
|
|
|
|
for (int reg = 0; reg < 8; ++reg)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE > 256
|
|
const uint availableAtBranchTarget = registerLastChangedAtBranchTarget[reg] + 1;
|
|
const uint availableAt = registerLastChanged[reg] + 1;
|
|
if (availableAt != availableAtBranchTarget)
|
|
{
|
|
registerLastChanged[reg] = i;
|
|
}
|
|
#else
|
|
const uint availableAtBranchTarget = (registerWasChangedAtBranchTarget & (1u << reg)) ? (((registerLastChangedAtBranchTarget >> (reg * 8)) & 0xFF) + 1) : 0;
|
|
const uint availableAt = (registerWasChanged & (1u << reg)) ? (((registerLastChanged >> (reg * 8)) & 0xFF) + 1) : 0;
|
|
if (availableAt != availableAtBranchTarget)
|
|
{
|
|
registerLastChanged = (registerLastChanged & ~(0xFFul << (reg * 8))) | ((ulong)(i) << (reg * 8));
|
|
registerWasChanged |= 1u << reg;
|
|
}
|
|
#endif
|
|
}
|
|
|
|
if (scratchpadAvailableAtBranchTarget != scratchpadAvailableAt)
|
|
scratchpadAvailableAt = i + 1;
|
|
|
|
if (scratchpadHighAvailableAtBranchTarget != scratchpadHighAvailableAt)
|
|
scratchpadHighAvailableAt = i + 1;
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CBRANCH;
|
|
|
|
if (opcode < RANDOMX_FREQ_CFROUND)
|
|
{
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CFROUND;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISTORE)
|
|
{
|
|
if (pass == 0)
|
|
{
|
|
// Mark ISTORE
|
|
e[i].x = inst.x | (0x80 << 8);
|
|
}
|
|
else
|
|
{
|
|
scratchpadAvailableAt = i + 1;
|
|
if ((mod >> 4) >= 14)
|
|
scratchpadHighAvailableAt = i + 1;
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISTORE;
|
|
}
|
|
}
|
|
|
|
// Sort p0
|
|
uint prev = p0[0].x;
|
|
#pragma unroll 1
|
|
for (int j = 1; j < prefetch_data_count; ++j)
|
|
{
|
|
uint2 cur = p0[j];
|
|
if (cur.x >= prev)
|
|
{
|
|
prev = cur.x;
|
|
continue;
|
|
}
|
|
|
|
int j1 = j - 1;
|
|
do {
|
|
p0[j1 + 1] = p0[j1];
|
|
--j1;
|
|
} while ((j1 >= 0) && (p0[j1].x >= cur.x));
|
|
p0[j1 + 1] = cur;
|
|
}
|
|
p0[prefetch_data_count].x = RANDOMX_PROGRAM_SIZE;
|
|
|
|
__global int* prefecth_vgprs_stack = (__global int*)(p0 + prefetch_data_count + 1);
|
|
|
|
// v86 - v127 will be used for global memory loads
|
|
enum { num_prefetch_vgprs = 21 };
|
|
|
|
#pragma unroll
|
|
for (int i = 0; i < num_prefetch_vgprs; ++i)
|
|
prefecth_vgprs_stack[i] = NUM_VGPR_REGISTERS - 2 - i * 2;
|
|
|
|
__global int* prefetched_vgprs = prefecth_vgprs_stack + num_prefetch_vgprs;
|
|
|
|
#pragma unroll 8
|
|
for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
|
|
prefetched_vgprs[i] = 0;
|
|
|
|
int k = 0;
|
|
uint2 prefetch_data = p0[0];
|
|
int mem_counter = 0;
|
|
int s_waitcnt_value = 63;
|
|
int num_prefetch_vgprs_available = num_prefetch_vgprs;
|
|
|
|
__global uint* last_branch_target = p;
|
|
|
|
const uint size_limit = (COMPILED_PROGRAM_SIZE - 200) / sizeof(uint);
|
|
__global uint* start_p = p;
|
|
|
|
#pragma unroll 1
|
|
for (int i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
|
|
{
|
|
const uint2 inst = e[i];
|
|
|
|
if (inst.x & (0x20 << 8))
|
|
last_branch_target = p;
|
|
|
|
bool done = false;
|
|
do {
|
|
uint2 jit_inst;
|
|
int jit_prefetch_vgpr_index;
|
|
int jit_vmcnt;
|
|
|
|
if (!done && (prefetch_data.x == i) && (num_prefetch_vgprs_available > 0))
|
|
{
|
|
++mem_counter;
|
|
const int vgpr_id = prefecth_vgprs_stack[--num_prefetch_vgprs_available];
|
|
prefetched_vgprs[prefetch_data.y] = vgpr_id | (mem_counter << 16);
|
|
|
|
jit_inst = e[prefetch_data.y];
|
|
jit_prefetch_vgpr_index = vgpr_id;
|
|
jit_vmcnt = mem_counter;
|
|
|
|
s_waitcnt_value = 63;
|
|
|
|
++k;
|
|
prefetch_data = p0[k];
|
|
}
|
|
else
|
|
{
|
|
const int prefetched_vgprs_data = prefetched_vgprs[i];
|
|
const int vgpr_id = prefetched_vgprs_data & 0xFFFF;
|
|
const int prev_mem_counter = prefetched_vgprs_data >> 16;
|
|
if (vgpr_id)
|
|
prefecth_vgprs_stack[num_prefetch_vgprs_available++] = vgpr_id;
|
|
|
|
if (inst.x & (0x80 << 8))
|
|
{
|
|
++mem_counter;
|
|
s_waitcnt_value = 63;
|
|
}
|
|
|
|
const int vmcnt = mem_counter - prev_mem_counter;
|
|
|
|
jit_inst = inst;
|
|
jit_prefetch_vgpr_index = -vgpr_id;
|
|
jit_vmcnt = (vmcnt < s_waitcnt_value) ? vmcnt : -1;
|
|
|
|
if (vmcnt < s_waitcnt_value)
|
|
s_waitcnt_value = vmcnt;
|
|
|
|
done = true;
|
|
}
|
|
|
|
p = jit_emit_instruction(p, last_branch_target, jit_inst, jit_prefetch_vgpr_index, jit_vmcnt, batch_size);
|
|
if (p - start_p > size_limit)
|
|
{
|
|
// Code size limit exceeded!!!
|
|
// Jump back to randomx_run kernel
|
|
*(p++) = S_SETPC_B64_S12_13; // s_setpc_b64 s[12:13]
|
|
return p;
|
|
}
|
|
} while (!done);
|
|
}
|
|
|
|
// Jump back to randomx_run kernel
|
|
*(p++) = S_SETPC_B64_S12_13; // s_setpc_b64 s[12:13]
|
|
return p;
|
|
}
|
|
|
|
__attribute__((reqd_work_group_size(64, 1, 1)))
|
|
__kernel void randomx_jit(__global ulong* entropy, __global ulong* registers, __global uint2* intermediate_programs, __global uint* programs, uint batch_size, __global uint32_t* rounding, uint32_t iteration)
|
|
{
|
|
const uint global_index = get_global_id(0) / 32;
|
|
const uint sub = get_global_id(0) % 32;
|
|
|
|
if (sub != 0)
|
|
return;
|
|
|
|
__global uint2* e = (__global uint2*)(entropy + global_index * (ENTROPY_SIZE / sizeof(ulong)) + (128 / sizeof(ulong)));
|
|
__global uint2* p0 = intermediate_programs + global_index * (INTERMEDIATE_PROGRAM_SIZE / sizeof(uint2));
|
|
__global uint* p = programs + global_index * (COMPILED_PROGRAM_SIZE / sizeof(uint));
|
|
|
|
generate_jit_code(e, p0, p, batch_size);
|
|
|
|
if (iteration == 0)
|
|
rounding[global_index] = 0;
|
|
|
|
__global ulong* R = registers + global_index * 32;
|
|
entropy += global_index * (ENTROPY_SIZE / sizeof(ulong));
|
|
|
|
// Group R registers
|
|
R[0] = 0;
|
|
R[1] = 0;
|
|
R[2] = 0;
|
|
R[3] = 0;
|
|
R[4] = 0;
|
|
R[5] = 0;
|
|
R[6] = 0;
|
|
R[7] = 0;
|
|
|
|
// Group A registers
|
|
__global double* A = (__global double*)(R + 24);
|
|
A[0] = getSmallPositiveFloatBits(entropy[0]);
|
|
A[1] = getSmallPositiveFloatBits(entropy[1]);
|
|
A[2] = getSmallPositiveFloatBits(entropy[2]);
|
|
A[3] = getSmallPositiveFloatBits(entropy[3]);
|
|
A[4] = getSmallPositiveFloatBits(entropy[4]);
|
|
A[5] = getSmallPositiveFloatBits(entropy[5]);
|
|
A[6] = getSmallPositiveFloatBits(entropy[6]);
|
|
A[7] = getSmallPositiveFloatBits(entropy[7]);
|
|
|
|
// ma, mx
|
|
((__global uint*)(R + 16))[0] = entropy[8] & CacheLineAlignMask;
|
|
((__global uint*)(R + 16))[1] = entropy[10];
|
|
|
|
// address registers
|
|
uint addressRegisters = entropy[12];
|
|
((__global uint*)(R + 17))[0] = 0 + (addressRegisters & 1);
|
|
addressRegisters >>= 1;
|
|
((__global uint*)(R + 17))[1] = 2 + (addressRegisters & 1);
|
|
addressRegisters >>= 1;
|
|
((__global uint*)(R + 17))[2] = 4 + (addressRegisters & 1);
|
|
addressRegisters >>= 1;
|
|
((__global uint*)(R + 17))[3] = 6 + (addressRegisters & 1);
|
|
|
|
// dataset offset
|
|
((__global uint*)(R + 19))[0] = (entropy[13] & DatasetExtraItems) * CacheLineSize;
|
|
|
|
// eMask
|
|
R[20] = getFloatMask(entropy[14]);
|
|
R[21] = getFloatMask(entropy[15]);
|
|
}
|