/*
Copyright (c) 2019 SChernykh
Portions Copyright (c) 2018-2019 tevador

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

#define CacheLineSize 64
#define ScratchpadL3Mask64 (RANDOMX_SCRATCHPAD_L3 - CacheLineSize)
#define CacheLineAlignMask ((RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1))

#define mantissaSize 52
#define exponentSize 11
#define mantissaMask ((1UL << mantissaSize) - 1)
#define exponentMask ((1UL << exponentSize) - 1)
#define exponentBias 1023
#define constExponentBits 0x300
#define dynamicExponentBits 4
#define staticExponentBits 4
#define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1)

#define RegistersCount 8
#define RegisterCountFlt (RegistersCount / 2)
#define ConditionMask ((1 << RANDOMX_JUMP_BITS) - 1)
#define ConditionOffset RANDOMX_JUMP_OFFSET
#define StoreL3Condition 14
#define DatasetExtraItems (RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE)

#define RegisterNeedsDisplacement 5

//
// VM state:
//
// Bytes 0-255: registers
// Bytes 256-1023: imm32 values (up to 192 values can be stored). IMUL_RCP and CBRANCH use 2 consecutive imm32 values.
// Bytes 1024-2047: up to 256 instructions
//
// Instruction encoding:
//
// Bits 0-2: dst (0-7)
// Bits 3-5: src (0-7)
// Bits 6-13: imm32/64 offset (in DWORDs, 0-191)
// Bit 14: src location (register, scratchpad)
// Bits 15-16: src shift (0-3), ADD/MUL switch for FMA instruction
// Bit 17: src=imm32
// Bit 18: src=imm64
// Bit 19: src = -src
// Bits 20-23: opcode (add_rs, add, mul, umul_hi, imul_hi, neg, xor, ror, swap, cbranch, store, fswap, fma, fsqrt, fdiv, cfround)
// Bits 24-27: how many parallel instructions to run starting with this one (1-16)
// Bits 28-31: how many of them are FP instructions (0-8)
//

#define DST_OFFSET 0
#define SRC_OFFSET 3
#define IMM_OFFSET 6
#define LOC_OFFSET 14
#define SHIFT_OFFSET 15
#define SRC_IS_IMM32_OFFSET 17
#define SRC_IS_IMM64_OFFSET 18
#define NEGATIVE_SRC_OFFSET 19
#define OPCODE_OFFSET 20
#define NUM_INSTS_OFFSET 24
#define NUM_FP_INSTS_OFFSET 28

// ISWAP r0, r0
#define INST_NOP (8 << OPCODE_OFFSET)

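// Illustrative (hypothetical) example of the encoding above: the 32-bit word
//     (3 << DST_OFFSET) | (5 << SRC_OFFSET) | (2 << OPCODE_OFFSET)
// describes "dst = r3, src = r5, opcode 2 (mul)". The NUM_INSTS field stores the parallel slot
// count minus one, so leaving bits 24-27 at zero means a single instruction in the group.
// execute_vm later extracts each field with expressions like (inst >> OPCODE_OFFSET) & 15.
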
typedef uchar uint8_t;
typedef ushort uint16_t;
typedef uint uint32_t;
typedef ulong uint64_t;

typedef int int32_t;
typedef long int64_t;

double getSmallPositiveFloatBits(uint64_t entropy)
{
    uint64_t exponent = entropy >> 59; //0..31
    uint64_t mantissa = entropy & mantissaMask;
    exponent += exponentBias;
    exponent &= exponentMask;
    exponent <<= mantissaSize;
    return as_double(exponent | mantissa);
}

uint64_t getStaticExponent(uint64_t entropy)
{
    uint64_t exponent = constExponentBits;
    exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits;
    exponent <<= mantissaSize;
    return exponent;
}

uint64_t getFloatMask(uint64_t entropy)
{
    const uint64_t mask22bit = (1UL << 22) - 1;
    return (entropy & mask22bit) | getStaticExponent(entropy);
}

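// Example (illustrative): getSmallPositiveFloatBits(0) builds exponent 0 + exponentBias = 1023
// with a zero mantissa, i.e. as_double(0x3FF0000000000000UL) == 1.0. The result is always a
// finite positive double in [1, 2^32), which is how the group A registers are seeded below.
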
void set_buffer(__local uint32_t *dst_buf, uint32_t N, const uint32_t value)
{
    uint32_t i = get_local_id(0) * sizeof(uint32_t);
    const uint32_t step = get_local_size(0) * sizeof(uint32_t);
    __local uint8_t* dst = ((__local uint8_t*)dst_buf) + i;
    while (i < sizeof(uint32_t) * N)
    {
        *(__local uint32_t*)(dst) = value;
        dst += step;
        i += step;
    }
}

uint64_t imul_rcp_value(uint32_t divisor)
{
    if ((divisor & (divisor - 1)) == 0)
    {
        return 1UL;
    }

    const uint64_t p2exp63 = 1UL << 63;

    uint64_t quotient = p2exp63 / divisor;
    uint64_t remainder = p2exp63 % divisor;

    const uint32_t bsr = 31 - clz(divisor);

    for (uint32_t shift = 0; shift <= bsr; ++shift)
    {
        const bool b = (remainder >= divisor - remainder);
        quotient = (quotient << 1) | (b ? 1 : 0);
        remainder = (remainder << 1) - (b ? divisor : 0);
    }

    return quotient;
}

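// What imul_rcp_value computes (illustrative): for a divisor that is not a power of two it
// long-divides 2^(64 + floor(log2(divisor))) by the divisor, producing one quotient bit per
// loop iteration, so the result always has its top bit set. Example: divisor = 3 gives bsr = 1
// and a reciprocal of floor(2^65 / 3) = 0xAAAAAAAAAAAAAAAA. Power-of-two divisors return 1,
// which init_vm below compiles to INST_NOP; IMUL_RCP then multiplies dst by this 64-bit constant.
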
#define set_byte(a, position, value) do { ((uint8_t*)&(a))[(position)] = (value); } while (0)
uint32_t get_byte(uint64_t a, uint32_t position) { return (a >> (position << 3)) & 0xFF; }
#define update_max(value, next_value) do { if ((value) < (next_value)) (value) = (next_value); } while (0)

__attribute__((reqd_work_group_size(32, 1, 1)))
|
|
__kernel void init_vm(__global const void* entropy_data, __global void* vm_states, __global uint32_t* rounding, uint32_t iteration)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
typedef uint8_t exec_t;
|
|
#else
|
|
typedef uint16_t exec_t;
|
|
#endif
|
|
|
|
__local uint32_t execution_plan_buf[RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * (32 / 8) * sizeof(exec_t) / sizeof(uint32_t)];
|
|
|
|
set_buffer(execution_plan_buf, sizeof(execution_plan_buf) / sizeof(uint32_t), 0);
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
const uint32_t global_index = get_global_id(0);
|
|
const uint32_t idx = global_index / 8;
|
|
const uint32_t sub = global_index % 8;
|
|
|
|
__local exec_t* execution_plan = (__local exec_t*)(execution_plan_buf + (get_local_id(0) / 8) * RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * sizeof(exec_t) / sizeof(uint32_t));
|
|
|
|
__global uint64_t* R = ((__global uint64_t*)vm_states) + idx * VM_STATE_SIZE / sizeof(uint64_t);
|
|
R[sub] = 0;
|
|
|
|
const __global uint64_t* entropy = ((const __global uint64_t*)entropy_data) + idx * ENTROPY_SIZE / sizeof(uint64_t);
|
|
|
|
__global double* A = (__global double*)(R + 24);
|
|
A[sub] = getSmallPositiveFloatBits(entropy[sub]);
|
|
|
|
if (sub == 0)
|
|
{
|
|
if (iteration == 0)
|
|
rounding[idx] = 0;
|
|
|
|
__global uint2* src_program = (__global uint2*)(entropy + 128 / sizeof(uint64_t));
|
|
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
uint64_t registerLastChanged = 0;
|
|
uint64_t registerWasChanged = 0;
|
|
#else
|
|
int32_t registerLastChanged[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
|
|
#endif
|
|
|
|
// Initialize CBRANCH instructions
|
|
for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
|
|
{
|
|
// Clear all src flags (branch target, FP, branch)
|
|
*(__global uint32_t*)(src_program + i) &= ~(0xF8U << 8);
|
|
|
|
const uint2 src_inst = src_program[i];
|
|
uint2 inst = src_inst;
|
|
|
|
uint32_t opcode = inst.x & 0xff;
|
|
const uint32_t dst = (inst.x >> 8) & 7;
|
|
const uint32_t src = (inst.x >> 16) & 7;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
set_byte(registerLastChanged, dst, i);
|
|
set_byte(registerWasChanged, dst, 1);
|
|
#else
|
|
registerLastChanged[dst] = i;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_RCP)
|
|
{
|
|
if (inst.y & (inst.y - 1))
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
set_byte(registerLastChanged, dst, i);
|
|
set_byte(registerWasChanged, dst, 1);
|
|
#else
|
|
registerLastChanged[dst] = i;
|
|
#endif
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_RCP;
|
|
|
|
if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
set_byte(registerLastChanged, dst, i);
|
|
set_byte(registerWasChanged, dst, 1);
|
|
#else
|
|
registerLastChanged[dst] = i;
|
|
#endif
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISWAP_R)
|
|
{
|
|
if (src != dst)
|
|
{
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
set_byte(registerLastChanged, dst, i);
|
|
set_byte(registerWasChanged, dst, 1);
|
|
set_byte(registerLastChanged, src, i);
|
|
set_byte(registerWasChanged, src, 1);
|
|
#else
|
|
registerLastChanged[dst] = i;
|
|
registerLastChanged[src] = i;
|
|
#endif
|
|
}
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R)
|
|
{
|
|
// Mark FP instruction (src |= 0x20)
|
|
*(__global uint32_t*)(src_program + i) |= 0x20 << 8;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_CBRANCH)
|
|
{
|
|
const uint32_t creg = dst;
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
const uint32_t change = get_byte(registerLastChanged, dst);
|
|
const int32_t lastChanged = (get_byte(registerWasChanged, dst) == 0) ? -1 : (int32_t)(change);
|
|
|
|
// Store condition register and branch target in CBRANCH instruction
|
|
*(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | ((lastChanged == -1) ? 0x90 : 0x10)) << 8) | (((uint32_t)(lastChanged) & 0xFF) << 16);
|
|
#else
|
|
const int32_t lastChanged = registerLastChanged[dst];
|
|
|
|
// Store condition register in CBRANCH instruction
|
|
*(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | 0x10) << 8);
|
|
#endif
|
|
|
|
// Mark branch target instruction (src |= 0x40)
|
|
*(__global uint32_t*)(src_program + lastChanged + 1) |= 0x40 << 8;
|
|
|
|
#if RANDOMX_PROGRAM_SIZE <= 256
|
|
uint32_t tmp = i | (i << 8);
|
|
registerLastChanged = tmp | (tmp << 16);
|
|
registerLastChanged = registerLastChanged | (registerLastChanged << 32);
|
|
|
|
registerWasChanged = 0x0101010101010101UL;
|
|
#else
|
|
registerLastChanged[0] = i;
|
|
registerLastChanged[1] = i;
|
|
registerLastChanged[2] = i;
|
|
registerLastChanged[3] = i;
|
|
registerLastChanged[4] = i;
|
|
registerLastChanged[5] = i;
|
|
registerLastChanged[6] = i;
|
|
registerLastChanged[7] = i;
|
|
#endif
|
|
}
|
|
}
|
|
|
|
uint64_t registerLatency = 0;
|
|
uint64_t registerReadCycle = 0;
|
|
uint64_t registerLatencyFP = 0;
|
|
uint64_t registerReadCycleFP = 0;
|
|
uint32_t ScratchpadHighLatency = 0;
|
|
volatile uint32_t ScratchpadLatency = 0;
|
|
|
|
int32_t first_available_slot = 0;
|
|
int32_t first_allowed_slot_cfround = 0;
|
|
int32_t last_used_slot = -1;
|
|
int32_t last_memory_op_slot = -1;
|
|
|
|
uint32_t num_slots_used = 0;
|
|
uint32_t num_instructions = 0;
|
|
|
|
int32_t first_instruction_slot = -1;
|
|
bool first_instruction_fp = false;
|
|
|
|
//if (global_index == 0)
|
|
//{
|
|
// for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j)
|
|
// {
|
|
// print_inst(src_program[j]);
|
|
// printf("\n");
|
|
// }
|
|
// printf("\n");
|
|
//}
|
|
|
|
// Schedule instructions
|
|
bool update_branch_target_mark = false;
|
|
bool first_available_slot_is_branch_target = false;
|
|
for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
|
|
{
|
|
const uint2 inst = src_program[i];
|
|
|
|
uint32_t opcode = inst.x & 0xff;
|
|
uint32_t dst = (inst.x >> 8) & 7;
|
|
const uint32_t src = (inst.x >> 16) & 7;
|
|
const uint32_t mod = (inst.x >> 24);
|
|
|
|
bool is_branch_target = (inst.x & (0x40 << 8)) != 0;
|
|
if (is_branch_target)
|
|
{
|
|
// If an instruction is a branch target, we can't move it before any previous instructions
|
|
first_available_slot = last_used_slot + 1;
|
|
|
|
// Mark this slot as a branch target
|
|
// Whatever instruction takes this slot will receive branch target flag
|
|
first_available_slot_is_branch_target = true;
|
|
}
|
|
|
|
const uint32_t dst_latency = get_byte(registerLatency, dst);
|
|
const uint32_t src_latency = get_byte(registerLatency, src);
|
|
const uint32_t reg_read_latency = (dst_latency > src_latency) ? dst_latency : src_latency;
|
|
const uint32_t mem_read_latency = ((dst == src) && ((inst.y & ScratchpadL3Mask64) >= RANDOMX_SCRATCHPAD_L2)) ? ScratchpadHighLatency : ScratchpadLatency;
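// Scheduling heuristic: when src == dst the load address comes from the immediate masked to the
// whole scratchpad, so if that offset lies beyond the L2 region the load only has to wait for the
// most recent L3-wide ISTORE (ScratchpadHighLatency); every other load waits for the most recent
// store of any kind (ScratchpadLatency).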
|
|
|
|
uint32_t full_read_latency = mem_read_latency;
|
|
update_max(full_read_latency, reg_read_latency);
|
|
|
|
uint32_t latency = 0;
|
|
bool is_memory_op = false;
|
|
bool is_memory_store = false;
|
|
bool is_nop = false;
|
|
bool is_branch = false;
|
|
bool is_swap = false;
|
|
bool is_src_read = true;
|
|
bool is_fp = false;
|
|
bool is_cfround = false;
|
|
|
|
do {
|
|
if (opcode < RANDOMX_FREQ_IADD_RS)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_RS;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_RCP)
|
|
{
|
|
is_src_read = false;
|
|
if (inst.y & (inst.y - 1))
|
|
latency = dst_latency;
|
|
else
|
|
is_nop = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_RCP;
|
|
|
|
if (opcode < RANDOMX_FREQ_INEG_R)
|
|
{
|
|
is_src_read = false;
|
|
latency = dst_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_INEG_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_M)
|
|
{
|
|
latency = full_read_latency;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
|
|
{
|
|
latency = reg_read_latency;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISWAP_R)
|
|
{
|
|
is_swap = true;
|
|
if (dst != src)
|
|
latency = reg_read_latency;
|
|
else
|
|
is_nop = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSWAP_R)
|
|
{
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_R)
|
|
{
|
|
dst %= RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_M)
|
|
{
|
|
dst %= RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
update_max(latency, src_latency);
|
|
update_max(latency, ScratchpadLatency);
|
|
is_fp = true;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_R)
|
|
{
|
|
dst %= RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_M)
|
|
{
|
|
dst %= RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
update_max(latency, src_latency);
|
|
update_max(latency, ScratchpadLatency);
|
|
is_fp = true;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSCAL_R)
|
|
{
|
|
dst %= RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSCAL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FMUL_R)
|
|
{
|
|
dst = (dst % RegisterCountFlt) + RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FDIV_M)
|
|
{
|
|
dst = (dst % RegisterCountFlt) + RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
update_max(latency, src_latency);
|
|
update_max(latency, ScratchpadLatency);
|
|
is_fp = true;
|
|
is_memory_op = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FDIV_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSQRT_R)
|
|
{
|
|
dst = (dst % RegisterCountFlt) + RegisterCountFlt;
|
|
latency = get_byte(registerLatencyFP, dst);
|
|
is_fp = true;
|
|
is_src_read = false;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSQRT_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_CBRANCH)
|
|
{
|
|
is_src_read = false;
|
|
is_branch = true;
|
|
latency = dst_latency;
|
|
|
|
// We can't move CBRANCH before any previous instructions
|
|
first_available_slot = last_used_slot + 1;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CBRANCH;
|
|
|
|
if (opcode < RANDOMX_FREQ_CFROUND)
|
|
{
|
|
latency = src_latency;
|
|
is_cfround = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CFROUND;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISTORE)
|
|
{
|
|
latency = reg_read_latency;
|
|
update_max(latency, (last_memory_op_slot + WORKERS_PER_HASH) / WORKERS_PER_HASH);
|
|
is_memory_op = true;
|
|
is_memory_store = true;
|
|
break;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISTORE;
|
|
|
|
is_nop = true;
|
|
} while (false);
|
|
|
|
if (is_nop)
|
|
{
|
|
if (is_branch_target)
|
|
{
|
|
// Mark next non-NOP instruction as the branch target instead of this NOP
|
|
update_branch_target_mark = true;
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if (update_branch_target_mark)
|
|
{
|
|
*(__global uint32_t*)(src_program + i) |= 0x40 << 8;
|
|
update_branch_target_mark = false;
|
|
is_branch_target = true;
|
|
}
|
|
|
|
int32_t first_allowed_slot = first_available_slot;
|
|
update_max(first_allowed_slot, latency * WORKERS_PER_HASH);
|
|
if (is_cfround)
|
|
update_max(first_allowed_slot, first_allowed_slot_cfround);
|
|
else
|
|
update_max(first_allowed_slot, get_byte(is_fp ? registerReadCycleFP : registerReadCycle, dst) * WORKERS_PER_HASH);
|
|
|
|
if (is_swap)
|
|
update_max(first_allowed_slot, get_byte(registerReadCycle, src) * WORKERS_PER_HASH);
|
|
|
|
int32_t slot_to_use = last_used_slot + 1;
|
|
update_max(slot_to_use, first_allowed_slot);
|
|
|
|
if (is_fp)
|
|
{
|
|
slot_to_use = -1;
|
|
for (int32_t j = first_allowed_slot; slot_to_use < 0; ++j)
|
|
{
|
|
if ((execution_plan[j] == 0) && (execution_plan[j + 1] == 0) && ((j + 1) % WORKERS_PER_HASH))
|
|
{
|
|
bool blocked = false;
|
|
for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k)
|
|
{
|
|
if (execution_plan[k] || (k == first_instruction_slot))
|
|
{
|
|
const uint32_t inst = src_program[execution_plan[k]].x;
|
|
|
|
// If there is an integer instruction which is a branch target or a branch, or this FP instruction is a branch target itself, we can't reorder it to add more FP instructions to this cycle
|
|
if (((inst & (0x20 << 8)) == 0) && (((inst & (0x50 << 8)) != 0) || is_branch_target))
|
|
{
|
|
blocked = true;
|
|
continue;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (!blocked)
|
|
{
|
|
for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k)
|
|
{
|
|
if (execution_plan[k] || (k == first_instruction_slot))
|
|
{
|
|
const uint32_t inst = src_program[execution_plan[k]].x;
|
|
if ((inst & (0x20 << 8)) == 0)
|
|
{
|
|
execution_plan[j] = execution_plan[k];
|
|
execution_plan[j + 1] = execution_plan[k + 1];
|
|
if (first_instruction_slot == k) first_instruction_slot = j;
|
|
if (first_instruction_slot == k + 1) first_instruction_slot = j + 1;
|
|
slot_to_use = k;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (slot_to_use < 0)
|
|
{
|
|
slot_to_use = j;
|
|
}
|
|
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
else
|
|
{
|
|
for (int32_t j = first_allowed_slot; j <= last_used_slot; ++j)
|
|
{
|
|
if (execution_plan[j] == 0)
|
|
{
|
|
slot_to_use = j;
|
|
break;
|
|
}
|
|
}
|
|
}
|
|
|
|
if (i == 0)
|
|
{
|
|
first_instruction_slot = slot_to_use;
|
|
first_instruction_fp = is_fp;
|
|
}
|
|
|
|
if (is_cfround)
|
|
{
|
|
first_allowed_slot_cfround = slot_to_use - (slot_to_use % WORKERS_PER_HASH) + WORKERS_PER_HASH;
|
|
}
|
|
|
|
++num_instructions;
|
|
|
|
execution_plan[slot_to_use] = i;
|
|
++num_slots_used;
|
|
|
|
if (is_fp)
|
|
{
|
|
execution_plan[slot_to_use + 1] = i;
|
|
++num_slots_used;
|
|
}
|
|
|
|
const uint32_t next_latency = (slot_to_use / WORKERS_PER_HASH) + 1;
|
|
|
|
if (is_src_read)
|
|
{
|
|
int32_t value = get_byte(registerReadCycle, src);
|
|
update_max(value, slot_to_use / WORKERS_PER_HASH);
|
|
set_byte(registerReadCycle, src, value);
|
|
}
|
|
|
|
if (is_memory_op)
|
|
{
|
|
update_max(last_memory_op_slot, slot_to_use);
|
|
}
|
|
|
|
if (is_cfround)
|
|
{
|
|
const uint32_t t = next_latency | (next_latency << 8);
|
|
registerLatencyFP = t | (t << 16);
|
|
registerLatencyFP = registerLatencyFP | (registerLatencyFP << 32);
|
|
}
|
|
else if (is_fp)
|
|
{
|
|
set_byte(registerLatencyFP, dst, next_latency);
|
|
|
|
int32_t value = get_byte(registerReadCycleFP, dst);
|
|
update_max(value, slot_to_use / WORKERS_PER_HASH);
|
|
set_byte(registerReadCycleFP, dst, value);
|
|
}
|
|
else
|
|
{
|
|
if (!is_memory_store && !is_nop)
|
|
{
|
|
set_byte(registerLatency, dst, next_latency);
|
|
if (is_swap)
|
|
set_byte(registerLatency, src, next_latency);
|
|
|
|
int32_t value = get_byte(registerReadCycle, dst);
|
|
update_max(value, slot_to_use / WORKERS_PER_HASH);
|
|
set_byte(registerReadCycle, dst, value);
|
|
}
|
|
|
|
if (is_branch)
|
|
{
|
|
const uint32_t t = next_latency | (next_latency << 8);
|
|
registerLatency = t | (t << 16);
|
|
registerLatency = registerLatency | (registerLatency << 32);
|
|
}
|
|
|
|
if (is_memory_store)
|
|
{
|
|
int32_t value = get_byte(registerReadCycle, dst);
|
|
update_max(value, slot_to_use / WORKERS_PER_HASH);
|
|
set_byte(registerReadCycle, dst, value);
|
|
ScratchpadLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
|
|
if ((mod >> 4) >= StoreL3Condition)
|
|
ScratchpadHighLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
|
|
}
|
|
}
|
|
|
|
if (execution_plan[first_available_slot] || (first_available_slot == first_instruction_slot))
|
|
{
|
|
if (first_available_slot_is_branch_target)
|
|
{
|
|
src_program[i].x |= 0x40 << 8;
|
|
first_available_slot_is_branch_target = false;
|
|
}
|
|
|
|
if (is_fp)
|
|
++first_available_slot;
|
|
|
|
do {
|
|
++first_available_slot;
|
|
} while ((first_available_slot < RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH) && (execution_plan[first_available_slot] != 0));
|
|
}
|
|
|
|
if (is_branch_target)
|
|
{
|
|
update_max(first_available_slot, is_fp ? (slot_to_use + 2) : (slot_to_use + 1));
|
|
}
|
|
|
|
update_max(last_used_slot, is_fp ? (slot_to_use + 1) : slot_to_use);
|
|
while (execution_plan[last_used_slot] || (last_used_slot == first_instruction_slot) || ((last_used_slot == first_instruction_slot + 1) && first_instruction_fp))
|
|
{
|
|
++last_used_slot;
|
|
}
|
|
--last_used_slot;
|
|
|
|
if (is_fp && (last_used_slot >= first_allowed_slot_cfround))
|
|
first_allowed_slot_cfround = last_used_slot + 1;
|
|
|
|
//if (global_index == 0)
|
|
//{
|
|
// printf("slot_to_use = %d, first_available_slot = %d, last_used_slot = %d\n", slot_to_use, first_available_slot, last_used_slot);
|
|
// for (int j = 0; j <= last_used_slot; ++j)
|
|
// {
|
|
// if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp))
|
|
// {
|
|
// print_inst(src_program[execution_plan[j]]);
|
|
// printf(" | ");
|
|
// }
|
|
// else
|
|
// {
|
|
// printf(" | ");
|
|
// }
|
|
// if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n");
|
|
// }
|
|
// printf("\n\n");
|
|
//}
|
|
}
|
|
|
|
//if (global_index == 0)
|
|
//{
|
|
// printf("IPC = %.3f, WPC = %.3f, num_instructions = %u, num_slots_used = %u, first_instruction_slot = %d, last_used_slot = %d, registerLatency = %016llx, registerLatencyFP = %016llx \n",
|
|
// num_instructions / static_cast<double>(last_used_slot / WORKERS_PER_HASH + 1),
|
|
// num_slots_used / static_cast<double>(last_used_slot / WORKERS_PER_HASH + 1),
|
|
// num_instructions,
|
|
// num_slots_used,
|
|
// first_instruction_slot,
|
|
// last_used_slot,
|
|
// registerLatency,
|
|
// registerLatencyFP
|
|
// );
|
|
|
|
// //for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j)
|
|
// //{
|
|
// // print_inst(src_program[j]);
|
|
// // printf("\n");
|
|
// //}
|
|
// //printf("\n");
|
|
|
|
// for (int j = 0; j <= last_used_slot; ++j)
|
|
// {
|
|
// if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp))
|
|
// {
|
|
// print_inst(src_program[execution_plan[j]]);
|
|
// printf(" | ");
|
|
// }
|
|
// else
|
|
// {
|
|
// printf(" | ");
|
|
// }
|
|
// if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n");
|
|
// }
|
|
// printf("\n\n");
|
|
//}
|
|
|
|
//atomicAdd((uint32_t*)num_vm_cycles, (last_used_slot / WORKERS_PER_HASH) + 1);
|
|
//atomicAdd((uint32_t*)(num_vm_cycles) + 1, num_slots_used);
|
|
|
|
uint32_t ma = (uint32_t)(entropy[8]) & CacheLineAlignMask;
|
|
uint32_t mx = (uint32_t)(entropy[10]) & CacheLineAlignMask;
|
|
|
|
uint32_t addressRegisters = (uint32_t)(entropy[12]);
|
|
addressRegisters = ((addressRegisters & 1) | (((addressRegisters & 2) ? 3U : 2U) << 8) | (((addressRegisters & 4) ? 5U : 4U) << 16) | (((addressRegisters & 8) ? 7U : 6U) << 24)) * sizeof(uint64_t);
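// Pack the four read-address registers chosen by entropy (r0/r1, r2/r3, r4/r5, r6/r7) into one
// word, one byte offset into R per byte, so execute_vm can rebuild the readReg0..readReg3
// pointers with simple shifts and masks.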
|
|
|
|
uint32_t datasetOffset = (entropy[13] & DatasetExtraItems) * CacheLineSize;
|
|
|
|
ulong2 eMask = *(__global ulong2*)(entropy + 14);
|
|
eMask.x = getFloatMask(eMask.x);
|
|
eMask.y = getFloatMask(eMask.y);
|
|
|
|
((__global uint32_t*)(R + 16))[0] = ma;
|
|
((__global uint32_t*)(R + 16))[1] = mx;
|
|
((__global uint32_t*)(R + 16))[2] = addressRegisters;
|
|
((__global uint32_t*)(R + 16))[3] = datasetOffset;
|
|
((__global ulong2*)(R + 18))[0] = eMask;
|
|
|
|
__global uint32_t* imm_buf = (__global uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t));
|
|
uint32_t imm_index = 0;
|
|
int32_t imm_index_fscal_r = -1;
|
|
__global uint32_t* compiled_program = (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t));
|
|
|
|
// Generate opcodes for execute_vm
|
|
int32_t branch_target_slot = -1;
|
|
int32_t k = -1;
|
|
for (int32_t i = 0; i <= last_used_slot; ++i)
|
|
{
|
|
if (!(execution_plan[i] || (i == first_instruction_slot) || ((i == first_instruction_slot + 1) && first_instruction_fp)))
|
|
continue;
|
|
|
|
uint32_t num_workers = 1;
|
|
uint32_t num_fp_insts = 0;
|
|
while ((i + num_workers <= last_used_slot) && ((i + num_workers) % WORKERS_PER_HASH) && (execution_plan[i + num_workers] || (i + num_workers == first_instruction_slot) || ((i + num_workers == first_instruction_slot + 1) && first_instruction_fp)))
|
|
{
|
|
if ((num_workers & 1) && ((src_program[execution_plan[i + num_workers]].x & (0x20 << 8)) != 0))
|
|
++num_fp_insts;
|
|
++num_workers;
|
|
}
|
|
|
|
//if (global_index == 0)
|
|
// printf("i = %d, num_workers = %u, num_fp_insts = %u\n", i, num_workers, num_fp_insts);
|
|
|
|
num_workers = ((num_workers - 1) << NUM_INSTS_OFFSET) | (num_fp_insts << NUM_FP_INSTS_OFFSET);
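// Fold the execution-group header into the bits that get OR-ed into every compiled word of this
// group: bits 24-27 hold (number of parallel slots - 1), bits 28-31 hold how many of them are
// FP instructions.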
|
|
|
|
const uint2 src_inst = src_program[execution_plan[i]];
|
|
uint2 inst = src_inst;
|
|
|
|
uint32_t opcode = inst.x & 0xff;
|
|
const uint32_t dst = (inst.x >> 8) & 7;
|
|
const uint32_t src = (inst.x >> 16) & 7;
|
|
const uint32_t mod = (inst.x >> 24);
|
|
|
|
const bool is_fp = (src_inst.x & (0x20 << 8)) != 0;
|
|
if (is_fp && ((i & 1) == 0))
|
|
++i;
|
|
|
|
const bool is_branch_target = (src_inst.x & (0x40 << 8)) != 0;
|
|
if (is_branch_target && (branch_target_slot < 0))
|
|
branch_target_slot = k;
|
|
|
|
++k;
|
|
|
|
inst.x = INST_NOP;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_RS)
|
|
{
|
|
const uint32_t shift = (mod >> 2) % 4;
|
|
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (shift << SHIFT_OFFSET);
|
|
|
|
if (dst != RegisterNeedsDisplacement)
|
|
{
|
|
// Encode regular ADD (opcode 1)
|
|
inst.x |= (1 << OPCODE_OFFSET);
|
|
}
|
|
else
|
|
{
|
|
// Encode ADD with src and imm32 (opcode 0)
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = inst.y;
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_RS;
|
|
|
|
if (opcode < RANDOMX_FREQ_IADD_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
|
|
if (src == dst)
|
|
{
|
|
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = inst.y;
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISUB_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET);
|
|
if (src == dst)
|
|
{
|
|
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = inst.y;
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (2 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (6 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMULH_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (6 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (4 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISMULH_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (4 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISMULH_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IMUL_RCP)
|
|
{
|
|
const uint64_t r = imul_rcp_value(inst.y);
|
|
if (r == 1)
|
|
{
|
|
*(compiled_program++) = INST_NOP | num_workers;
|
|
continue;
|
|
}
|
|
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET);
|
|
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM64_OFFSET);
|
|
|
|
if (imm_index < IMM_INDEX_COUNT - 1)
|
|
{
|
|
imm_buf[imm_index] = ((const uint32_t*)&r)[0];
|
|
imm_buf[imm_index + 1] = ((const uint32_t*)&r)[1];
|
|
imm_index += 2;
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IMUL_RCP;
|
|
|
|
if (opcode < RANDOMX_FREQ_INEG_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (5 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_INEG_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (3 << OPCODE_OFFSET);
|
|
if (src == dst)
|
|
{
|
|
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = inst.y;
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_IXOR_M)
|
|
{
|
|
const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (3 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IXOR_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (7 << OPCODE_OFFSET);
|
|
if (src == dst)
|
|
{
|
|
inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = inst.y;
|
|
}
|
|
if (opcode >= RANDOMX_FREQ_IROR_R)
|
|
{
|
|
inst.x |= (1 << NEGATIVE_SRC_OFFSET);
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISWAP_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (8 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = ((src != dst) ? inst.x : INST_NOP) | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSWAP_R)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (11 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSWAP_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_R)
|
|
{
|
|
inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FADD_M)
|
|
{
|
|
const uint32_t location = (mod % 4) ? 1 : 2;
|
|
inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FADD_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_R)
|
|
{
|
|
inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSUB_M)
|
|
{
|
|
const uint32_t location = (mod % 4) ? 1 : 2;
|
|
inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSUB_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSCAL_R)
|
|
{
|
|
inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (1 << SRC_IS_IMM64_OFFSET) | (3 << OPCODE_OFFSET);
|
|
if (imm_index_fscal_r >= 0)
|
|
{
|
|
inst.x |= (imm_index_fscal_r << IMM_OFFSET);
|
|
}
|
|
else
|
|
{
|
|
imm_index_fscal_r = imm_index;
|
|
inst.x |= (imm_index << IMM_OFFSET);
|
|
|
|
if (imm_index < IMM_INDEX_COUNT - 1)
|
|
{
|
|
imm_buf[imm_index] = 0;
|
|
imm_buf[imm_index + 1] = 0x80F00000UL;
|
|
imm_index += 2;
|
|
}
|
|
}
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSCAL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FMUL_R)
|
|
{
|
|
inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (1 << SHIFT_OFFSET) | (12 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FMUL_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_FDIV_M)
|
|
{
|
|
const uint32_t location = (mod % 4) ? 1 : 2;
|
|
inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (15 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FDIV_M;
|
|
|
|
if (opcode < RANDOMX_FREQ_FSQRT_R)
|
|
{
|
|
inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (14 << OPCODE_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_FSQRT_R;
|
|
|
|
if (opcode < RANDOMX_FREQ_CBRANCH)
|
|
{
|
|
inst.x = (dst << DST_OFFSET) | (9 << OPCODE_OFFSET);
|
|
inst.x |= (imm_index << IMM_OFFSET);
|
|
|
|
const uint32_t cshift = (mod >> 4) + ConditionOffset;
|
|
|
|
uint32_t imm = inst.y | (1U << cshift);
|
|
if (cshift > 0)
|
|
imm &= ~(1U << (cshift - 1));
|
|
|
|
if (imm_index < IMM_INDEX_COUNT - 1)
|
|
{
|
|
imm_buf[imm_index] = imm;
|
|
imm_buf[imm_index + 1] = cshift | ((uint32_t)(branch_target_slot) << 5);
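// Second imm32 word for CBRANCH: bits 0-4 hold the condition shift, the remaining bits hold the
// branch target index; execute_vm recovers it as ((int32_t)(imm.y) >> 5) when the branch is taken.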
|
|
imm_index += 2;
|
|
}
|
|
else
|
|
{
|
|
// Data doesn't fit, skip it
|
|
inst.x = INST_NOP;
|
|
}
|
|
|
|
branch_target_slot = -1;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CBRANCH;
|
|
|
|
if (opcode < RANDOMX_FREQ_CFROUND)
|
|
{
|
|
inst.x = (src << SRC_OFFSET) | (13 << OPCODE_OFFSET) | ((inst.y & 63) << IMM_OFFSET);
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_CFROUND;
|
|
|
|
if (opcode < RANDOMX_FREQ_ISTORE)
|
|
{
|
|
const uint32_t location = ((mod >> 4) >= StoreL3Condition) ? 3 : ((mod % 4) ? 1 : 2);
|
|
inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (10 << OPCODE_OFFSET);
|
|
inst.x |= imm_index << IMM_OFFSET;
|
|
if (imm_index < IMM_INDEX_COUNT)
|
|
imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
|
|
else
|
|
inst.x = INST_NOP;
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
continue;
|
|
}
|
|
opcode -= RANDOMX_FREQ_ISTORE;
|
|
|
|
*(compiled_program++) = inst.x | num_workers;
|
|
}
|
|
|
|
((__global uint32_t*)(R + 20))[0] = (uint32_t)(compiled_program - (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t)));
|
|
}
|
|
}
|
|
|
|
void load_buffer(__local uint64_t *dst_buf, size_t N, __global const void* src_buf)
|
|
{
|
|
uint32_t i = get_local_id(0) * sizeof(uint64_t);
|
|
const uint32_t step = get_local_size(0) * sizeof(uint64_t);
|
|
__global const uint8_t* src = ((__global const uint8_t*)src_buf) + get_group_id(0) * sizeof(uint64_t) * N + i;
|
|
__local uint8_t* dst = ((__local uint8_t*)dst_buf) + i;
|
|
while (i < sizeof(uint64_t) * N)
|
|
{
|
|
*(__local uint64_t*)(dst) = *(__global uint64_t*)(src);
|
|
src += step;
|
|
dst += step;
|
|
i += step;
|
|
}
|
|
}
|
|
|
|
double load_F_E_groups(int value, uint64_t andMask, uint64_t orMask)
|
|
{
|
|
double t = convert_double_rtn(value);
|
|
uint64_t x = as_ulong(t);
|
|
x &= andMask;
|
|
x |= orMask;
|
|
return as_double(x);
|
|
}
|
|
|
|
// Software fused multiply-add (a * b + c) with explicit handling of the RandomX rounding modes;
// falls back to the hardware fma() when rounding_mode == 0 (round to nearest).
|
|
double fma_soft(double a, double b, double c, uint32_t rounding_mode)
|
|
{
|
|
if (rounding_mode == 0)
|
|
return fma(a, b, c);
|
|
|
|
if ((a == 0.0) || (b == 0.0))
|
|
return c;
|
|
|
|
if (b == 1.0)
|
|
{
|
|
if (c == 0.0)
|
|
return a;
|
|
|
|
if (c == -a)
|
|
{
|
|
const uint64_t minus_zero = 1UL << 63;
|
|
return (rounding_mode == 1) ? as_double(minus_zero) : 0.0;
|
|
}
|
|
}
|
|
|
|
const uint64_t mantissa_size = 52;
|
|
const uint64_t mantissa_mask = (1UL << 52) - 1;
|
|
|
|
const uint64_t exponent_size = 11;
|
|
const uint64_t exponent_mask = (1 << exponent_size) - 1;
|
|
|
|
uint2 a2 = as_uint2(a);
|
|
uint2 b2 = as_uint2(b);
|
|
uint2 c2 = as_uint2(c);
|
|
|
|
const uint32_t exponent_a = (a2.y >> 20) & exponent_mask;
|
|
const uint32_t exponent_b = (b2.y >> 20) & exponent_mask;
|
|
const uint32_t exponent_c = (c2.y >> 20) & exponent_mask;
|
|
|
|
if ((exponent_a == 2047) || (exponent_b == 2047) || (exponent_c == 2047))
|
|
{
|
|
const uint64_t inf = 2047UL << 52;
|
|
return as_double(inf);
|
|
}
|
|
|
|
const uint32_t sign_a = a2.y >> 31;
|
|
const uint32_t sign_b = b2.y >> 31;
|
|
const uint32_t sign_c = c2.y >> 31;
|
|
|
|
a2.y = (a2.y & ((1U << 20) - 1)) | (1U << 20);
|
|
b2.y = (b2.y & ((1U << 20) - 1)) | (1U << 20);
|
|
c2.y = (c2.y & ((1U << 20) - 1)) | (1U << 20);
|
|
|
|
uint64_t mantissa_a = as_ulong(a2);
|
|
uint64_t mantissa_b = as_ulong(b2);
|
|
uint64_t mantissa_c = as_ulong(c2);
|
|
|
|
uint64_t mul_result[2];
|
|
mul_result[0] = mantissa_a * mantissa_b;
|
|
mul_result[1] = mul_hi(mantissa_a, mantissa_b);
|
|
|
|
uint32_t exp_correction = mul_result[1] >> 41;
|
|
uint32_t exponent_mul_result = exponent_a + exponent_b + exp_correction - 1023;
|
|
uint32_t sign_mul_result = sign_a ^ sign_b;
|
|
|
|
if (exponent_mul_result >= 2047)
|
|
{
|
|
const uint64_t inf_rnd = (2047UL << 52) - (rounding_mode & 1);
|
|
return as_double(inf_rnd);
|
|
}
|
|
|
|
uint64_t fma_result[2];
|
|
uint64_t t[2];
|
|
uint32_t exponent_fma_result;
|
|
|
|
if (exponent_mul_result >= exponent_c)
|
|
{
|
|
uint32_t shift = 23 - exp_correction;
|
|
fma_result[0] = mul_result[0] << shift;
|
|
fma_result[1] = (mul_result[1] << shift) | (mul_result[0] >> (64 - shift));
|
|
|
|
int32_t shift2 = (127 - 52) + (int32_t)(exponent_c - exponent_mul_result);
|
|
|
|
if (shift2 >= 0)
|
|
{
|
|
if (shift2 >= 64)
|
|
{
|
|
t[0] = 0;
|
|
t[1] = mantissa_c << (shift2 - 64);
|
|
}
|
|
else
|
|
{
|
|
t[0] = mantissa_c << shift2;
|
|
t[1] = shift2 ? (mantissa_c >> (64 - shift2)) : 0;
|
|
}
|
|
}
|
|
else
|
|
{
|
|
t[0] = (shift2 < -52) ? 0 : (mantissa_c >> (-shift2));
|
|
t[1] = 0;
|
|
if ((t[0] == 0) && (c != 0.0))
|
|
t[0] = 1;
|
|
}
|
|
|
|
exponent_fma_result = exponent_mul_result;
|
|
}
|
|
else
|
|
{
|
|
t[0] = 0;
|
|
t[1] = mantissa_c << 11;
|
|
|
|
int32_t shift2 = (127 - 104 - exp_correction) + (int32_t)(exponent_mul_result - exponent_c);
|
|
if (shift2 >= 0)
|
|
{
|
|
fma_result[0] = mul_result[0] << shift2;
|
|
fma_result[1] = (mul_result[1] << shift2) | (shift2 ? (mul_result[0] >> (64 - shift2)) : 0);
|
|
}
|
|
else
|
|
{
|
|
shift2 = -shift2;
|
|
if (shift2 >= 64)
|
|
{
|
|
shift2 -= 64;
|
|
fma_result[0] = (shift2 < 64) ? (mul_result[1] >> shift2) : 0;
|
|
fma_result[1] = 0;
|
|
if (fma_result[0] == 0)
|
|
fma_result[0] = 1;
|
|
}
|
|
else
|
|
{
|
|
fma_result[0] = (mul_result[0] >> shift2) | (mul_result[1] << (64 - shift2));
|
|
fma_result[1] = mul_result[1] >> shift2;
|
|
}
|
|
}
|
|
|
|
exponent_fma_result = exponent_c;
|
|
}
|
|
|
|
uint32_t sign_fma_result;
|
|
|
|
if (sign_mul_result == sign_c)
|
|
{
|
|
fma_result[0] += t[0];
|
|
fma_result[1] += t[1] + ((fma_result[0] < t[0]) ? 1 : 0);
|
|
|
|
exp_correction = (fma_result[1] < t[1]) ? 1 : 0;
|
|
sign_fma_result = sign_mul_result;
|
|
}
|
|
else
|
|
{
|
|
const uint32_t borrow = (fma_result[0] < t[0]) ? 1 : 0;
|
|
fma_result[0] -= t[0];
|
|
|
|
t[1] += borrow;
|
|
const uint32_t change_sign = (fma_result[1] < t[1]) ? 1 : 0;
|
|
fma_result[1] -= t[1];
|
|
|
|
sign_fma_result = sign_mul_result ^ change_sign;
|
|
if (change_sign)
|
|
{
|
|
fma_result[0] = -(int64_t)(fma_result[0]);
|
|
fma_result[1] = ~fma_result[1];
|
|
fma_result[1] += fma_result[0] ? 0 : 1;
|
|
}
|
|
|
|
if (fma_result[1] == 0)
|
|
{
|
|
if (fma_result[0] == 0)
|
|
return 0.0;
|
|
|
|
exponent_fma_result -= 64;
|
|
fma_result[1] = fma_result[0];
|
|
fma_result[0] = 0;
|
|
}
|
|
|
|
const uint32_t index = clz(fma_result[1]);
|
|
if (index)
|
|
{
|
|
exponent_fma_result -= index;
|
|
fma_result[1] = (fma_result[1] << index) | (fma_result[0] >> (64 - index));
|
|
}
|
|
|
|
exp_correction = 0;
|
|
}
|
|
|
|
const uint32_t shift = 11 + exp_correction;
|
|
const uint32_t round_up = (fma_result[0] || (fma_result[1] & ((1 << shift) - 1))) ? 1 : 0;
|
|
|
|
fma_result[1] >>= shift;
|
|
fma_result[1] &= mantissa_mask;
|
|
if (rounding_mode + sign_fma_result == 2)
|
|
{
|
|
fma_result[1] += round_up;
|
|
if (fma_result[1] == (1UL << mantissa_size))
|
|
{
|
|
fma_result[1] = 0;
|
|
++exponent_fma_result;
|
|
}
|
|
}
|
|
fma_result[1] |= (uint64_t)(exponent_fma_result + exp_correction) << mantissa_size;
|
|
fma_result[1] |= (uint64_t)(sign_fma_result) << 63;
|
|
|
|
return as_double(fma_result[1]);
|
|
}
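// Note: execute_vm routes FADD_R/M, FSUB_R/M and FMUL_R through this helper as well, calling
// fma_soft(a, is_mul ? b : 1.0, is_mul ? 0.0 : b, fprc), so one soft-float path covers all
// rounded FP arithmetic except division and square root.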
|
|
|
|
double div_rnd(double a, double b, uint32_t fprc)
|
|
{
|
|
double y0 = 1.0 / b;
|
|
|
|
// Do 1 Newton-Raphson iteration to get correct rounding
|
|
const double t0 = a * y0;
|
|
const double t1 = fma(-b, t0, a);
|
|
double result = fma_soft(y0, t1, t0, fprc);
|
|
|
|
// Check for infinity/NaN
|
|
const uint64_t inf = 2047UL << 52;
|
|
const uint64_t inf_rnd = inf - (fprc & 1);
|
|
|
|
if (((as_ulong(result) >> 52) & 2047) == 2047) result = as_double(inf_rnd);
|
|
if (as_ulong(a) == inf) result = a;
|
|
|
|
return (a == b) ? 1.0 : result;
|
|
}
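// Sketch of the refinement above: t1 = fma(-b, t0, a) is the residual a - b*(a*y0) computed with
// a single rounding, so fma_soft(y0, t1, t0, fprc) nudges t0 = a*y0 toward a/b while applying the
// requested rounding mode; the (a == b) check keeps x/x equal to 1.0 in every mode.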
|
|
|
|
double sqrt_rnd(double x, uint32_t fprc)
|
|
{
|
|
double y0 = rsqrt(x);
|
|
|
|
// First Newton-Raphson iteration
|
|
double t0 = y0 * x;
|
|
double t1 = y0 * -0.5;
|
|
t1 = fma(t1, t0, 0.5); // 0.5 * (1.0 - y0 * y0 * x)
|
|
const double y1_x = fma(t0, t1, t0); // y1 * x = 0.5 * y0 * x * (3.0 - y0 * y0 * x)
|
|
|
|
// Second Newton-Raphson iteration
|
|
y0 *= 0.5;
|
|
y0 = fma(y0, t1, y0); // 0.5 * y1
|
|
t1 = fma(-y1_x, y1_x, x); // x * (1.0 - x * y1 * y1)
|
|
|
|
double result = fma_soft(t1, y0, y1_x, fprc); // x * 0.5 * y1 * (3.0 - x * y1 * y1)
|
|
|
|
// Check for infinity
|
|
if (*((uint64_t*) &x) == (2047UL << 52)) result = x;
|
|
|
|
return result;
|
|
}
|
|
|
|
uint32_t inner_loop(
|
|
const uint32_t program_length,
|
|
__local const uint32_t* compiled_program,
|
|
const int32_t sub,
|
|
__global uint8_t* scratchpad,
|
|
const uint32_t fp_reg_offset,
|
|
const uint32_t fp_reg_group_A_offset,
|
|
__local uint64_t* R,
|
|
__local uint32_t* imm_buf,
|
|
const uint32_t batch_size,
|
|
uint32_t fprc,
|
|
const uint32_t fp_workers_mask,
|
|
const uint64_t xexponentMask,
|
|
const uint32_t workers_mask
|
|
)
|
|
{
|
|
const int32_t sub2 = sub >> 1;
|
|
imm_buf[IMM_INDEX_COUNT + 1] = fprc;
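// The two imm_buf slots just past IMM_INDEX_COUNT are reused as shared per-hash state:
// [IMM_INDEX_COUNT] holds the next instruction pointer (so CBRANCH can redirect every lane) and
// [IMM_INDEX_COUNT + 1] holds the current rounding mode; both are re-read after the barrier at
// execution_end below.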
|
|
|
|
#pragma unroll 1
|
|
for (int32_t ip = 0; ip < program_length;)
|
|
{
|
|
imm_buf[IMM_INDEX_COUNT] = ip;
|
|
|
|
uint32_t inst = compiled_program[ip];
|
|
const int32_t num_workers = (inst >> NUM_INSTS_OFFSET) & (WORKERS_PER_HASH - 1);
|
|
const int32_t num_fp_insts = (inst >> NUM_FP_INSTS_OFFSET) & (WORKERS_PER_HASH - 1);
|
|
const int32_t num_insts = num_workers - num_fp_insts;
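// num_workers is stored as (parallel slot count - 1) and each FP instruction occupies two
// consecutive slots (one per 64-bit half of its FP register), so num_insts + 1 equals the number
// of compiled 32-bit words in this group, which is how far ip advances at execution_end.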
|
|
|
|
if (sub <= num_workers)
|
|
{
|
|
const int32_t inst_offset = sub - num_fp_insts;
|
|
const bool is_fp = inst_offset < num_fp_insts;
|
|
inst = compiled_program[ip + (is_fp ? sub2 : inst_offset)];
|
|
//if ((idx == 0) && (ic == 0))
|
|
//{
|
|
// printf("num_fp_insts = %u, sub = %u, ip = %u, inst = %08x\n", num_fp_insts, sub, ip + ((sub < num_fp_insts * 2) ? (sub / 2) : (sub - num_fp_insts)), inst);
|
|
//}
|
|
|
|
//asm("// INSTRUCTION DECODING BEGIN");
|
|
|
|
uint32_t opcode = (inst >> OPCODE_OFFSET) & 15;
|
|
const uint32_t location = (inst >> LOC_OFFSET) & 1;
|
|
|
|
const uint32_t reg_size_shift = is_fp ? 4 : 3;
|
|
const uint32_t reg_base_offset = is_fp ? fp_reg_offset : 0;
|
|
const uint32_t reg_base_src_offset = is_fp ? fp_reg_group_A_offset : 0;
|
|
|
|
uint32_t dst_offset = (inst >> DST_OFFSET) & 7;
|
|
dst_offset = reg_base_offset + (dst_offset << reg_size_shift);
|
|
|
|
uint32_t src_offset = (inst >> SRC_OFFSET) & 7;
|
|
src_offset = (src_offset << 3) + (location ? 0 : reg_base_src_offset);
|
|
|
|
__local uint64_t* dst_ptr = (__local uint64_t*)((__local uint8_t*)(R) + dst_offset);
|
|
__local uint64_t* src_ptr = (__local uint64_t*)((__local uint8_t*)(R) + src_offset);
|
|
|
|
const uint32_t imm_offset = (inst >> IMM_OFFSET) & 255;
|
|
__local const uint32_t* imm_ptr = imm_buf + imm_offset;
|
|
|
|
uint64_t dst = *dst_ptr;
|
|
uint64_t src = *src_ptr;
|
|
uint2 imm;
|
|
imm.x = imm_ptr[0];
|
|
imm.y = imm_ptr[1];
|
|
|
|
//asm("// INSTRUCTION DECODING END");
|
|
|
|
if (location)
|
|
{
|
|
//asm("// SCRATCHPAD ACCESS BEGIN");
|
|
|
|
const uint32_t loc_shift = (imm.x >> 21) & 31;
|
|
const uint32_t mask = (0xFFFFFFFFU >> loc_shift) - 7;
|
|
|
|
const bool is_read = (opcode != 10);
|
|
uint32_t addr = is_read ? ((loc_shift == LOC_L3) ? 0 : (uint32_t)(src)) : (uint32_t)(dst);
|
|
addr += (int32_t)(imm.x);
|
|
addr &= mask;
|
|
|
|
__global uint64_t* ptr = (__global uint64_t*)(scratchpad + addr);
|
|
|
|
if (is_read)
|
|
{
|
|
src = *ptr;
|
|
}
|
|
else
|
|
{
|
|
*ptr = src;
|
|
goto execution_end;
|
|
}
|
|
|
|
//asm("// SCRATCHPAD ACCESS END");
|
|
}
|
|
|
|
{
|
|
//asm("// EXECUTION BEGIN");
|
|
|
|
if (inst & (1 << SRC_IS_IMM32_OFFSET)) src = (uint64_t)((int64_t)((int32_t)(imm.x)));
|
|
|
|
// Check instruction opcodes (most frequent instructions come first)
|
|
if (opcode <= 3)
|
|
{
|
|
//asm("// IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256) ------>");
|
|
if (inst & (1 << NEGATIVE_SRC_OFFSET)) src = (uint64_t)(-(int64_t)(src));
|
|
if (opcode == 0) dst += (int32_t)(imm.x);
|
|
const uint32_t shift = (inst >> SHIFT_OFFSET) & 3;
|
|
if (opcode < 2) dst += src << shift;
|
|
const uint64_t imm64 = *((uint64_t*) &imm);
|
|
if (inst & (1 << SRC_IS_IMM64_OFFSET)) src = imm64;
|
|
if (opcode == 2) dst *= src;
|
|
if (opcode == 3) dst ^= src;
|
|
//asm("// <------ IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256)");
|
|
}
|
|
else if (opcode == 12)
|
|
{
|
|
//asm("// FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256) ------>");
|
|
|
|
if (location) src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32))));
|
|
if (inst & (1 << NEGATIVE_SRC_OFFSET)) src ^= 0x8000000000000000UL;
|
|
|
|
const bool is_mul = (inst & (1 << SHIFT_OFFSET)) != 0;
|
|
const double a = as_double(dst);
|
|
const double b = as_double(src);
|
|
|
|
dst = as_ulong(fma_soft(a, is_mul ? b : 1.0, is_mul ? 0.0 : b, fprc));
|
|
|
|
//asm("// <------ FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256)");
|
|
}
|
|
else if (opcode == 9)
|
|
{
|
|
//asm("// CBRANCH (16/256) ------>");
|
|
dst += (int32_t)(imm.x);
|
|
if (((uint32_t)(dst) & (ConditionMask << (imm.y & 31))) == 0)
|
|
{
|
|
imm_buf[IMM_INDEX_COUNT] = (uint32_t)(((int32_t)(imm.y) >> 5) - num_insts);
|
|
}
|
|
//asm("// <------ CBRANCH (16/256)");
|
|
}
|
|
else if (opcode == 7)
|
|
{
|
|
//asm("// IROR_R, IROL_R (10/256) ------>");
|
|
uint32_t shift1 = src & 63;
|
|
#if RANDOMX_FREQ_IROL_R > 0
|
|
const uint32_t shift2 = 64 - shift1;
|
|
const bool is_rol = (inst & (1 << NEGATIVE_SRC_OFFSET));
|
|
dst = (dst >> (is_rol ? shift2 : shift1)) | (dst << (is_rol ? shift1 : shift2));
|
|
#else
|
|
dst = (dst >> shift1) | (dst << (64 - shift1));
|
|
#endif
|
|
//asm("// <------ IROR_R, IROL_R (10/256)");
|
|
}
|
|
else if (opcode == 14)
|
|
{
|
|
//asm("// FSQRT_R (6/256) ------>");
|
|
dst = as_ulong(sqrt_rnd(as_double(dst), fprc));
|
|
//asm("// <------ FSQRT_R (6/256)");
|
|
}
|
|
else if (opcode == 6)
|
|
{
|
|
//asm("// IMULH_R, IMULH_M (5/256) ------>");
|
|
dst = mul_hi(dst, src);
|
|
//asm("// <------ IMULH_R, IMULH_M (5/256)");
|
|
}
|
|
else if (opcode == 4)
|
|
{
|
|
//asm("// ISMULH_R, ISMULH_M (5/256) ------>");
|
|
dst = (uint64_t)(mul_hi((int64_t)(dst), (int64_t)(src)));
|
|
//asm("// <------ ISMULH_R, ISMULH_M (5/256)");
|
|
}
|
|
else if (opcode == 11)
|
|
{
|
|
//asm("// FSWAP_R (4/256) ------>");
|
|
dst = *(__local uint64_t*)((__local uint8_t*)(R) + (dst_offset ^ 8));
|
|
//asm("// <------ FSWAP_R (4/256)");
|
|
}
|
|
else if (opcode == 8)
|
|
{
|
|
//asm("// ISWAP_R (4/256) ------>");
|
|
*src_ptr = dst;
|
|
dst = src;
|
|
//asm("// <------ ISWAP_R (4/256)");
|
|
}
|
|
else if (opcode == 15)
|
|
{
|
|
//asm("// FDIV_M (4/256) ------>");
|
|
src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32))));
|
|
src &= dynamicMantissaMask;
|
|
src |= xexponentMask;
|
|
dst = as_ulong(div_rnd(as_double(dst), as_double(src), fprc));
|
|
//asm("// <------ FDIV_M (4/256)");
|
|
}
|
|
else if (opcode == 5)
|
|
{
|
|
//asm("// INEG_R (2/256) ------>");
|
|
dst = (uint64_t)(-(int64_t)(dst));
|
|
//asm("// <------ INEG_R (2/256)");
|
|
}
|
|
// CFROUND check will be skipped and removed entirely by the compiler if ROUNDING_MODE >= 0
|
|
else if (ROUNDING_MODE < 0)
|
|
{
|
|
//asm("// CFROUND (1/256) ------>");
|
|
imm_buf[IMM_INDEX_COUNT + 1] = ((src >> imm_offset) | (src << (64 - imm_offset))) & 3;
|
|
//asm("// <------ CFROUND (1/256)");
|
|
goto execution_end;
|
|
}
|
|
|
|
*dst_ptr = dst;
|
|
//asm("// EXECUTION END");
|
|
}
|
|
}
|
|
|
|
execution_end:
|
|
{
|
|
//asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE BEGIN");
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
ip = imm_buf[IMM_INDEX_COUNT];
|
|
fprc = imm_buf[IMM_INDEX_COUNT + 1];
|
|
|
|
//asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE END");
|
|
|
|
ip += num_insts + 1;
|
|
}
|
|
}
|
|
|
|
return fprc;
|
|
}
|
|
|
|
#if WORKERS_PER_HASH == 16
|
|
__attribute__((reqd_work_group_size(32, 1, 1)))
|
|
#else
|
|
__attribute__((reqd_work_group_size(16, 1, 1)))
|
|
#endif
|
|
__kernel void execute_vm(__global void* vm_states, __global void* rounding, __global void* scratchpads, __global const void* dataset_ptr, uint32_t batch_size, uint32_t num_iterations, uint32_t first, uint32_t last)
|
|
{
|
|
// 2 hashes per warp, 4 KB shared memory for VM states
|
|
__local uint64_t vm_states_local[(VM_STATE_SIZE * 2) / sizeof(uint64_t)];
|
|
|
|
load_buffer(vm_states_local, sizeof(vm_states_local) / sizeof(uint64_t), vm_states);
|
|
|
|
barrier(CLK_LOCAL_MEM_FENCE);
|
|
|
|
enum { IDX_WIDTH = (WORKERS_PER_HASH == 16) ? 16 : 8 };
|
|
|
|
__local uint64_t* R = vm_states_local + (get_local_id(0) / IDX_WIDTH) * VM_STATE_SIZE / sizeof(uint64_t);
|
|
__local double* F = (__local double*)(R + 8);
|
|
__local double* E = (__local double*)(R + 16);
|
|
|
|
const uint32_t global_index = get_global_id(0);
|
|
const int32_t idx = global_index / IDX_WIDTH;
|
|
const int32_t sub = global_index % IDX_WIDTH;
|
|
|
|
uint32_t ma = ((__local uint32_t*)(R + 16))[0];
|
|
uint32_t mx = ((__local uint32_t*)(R + 16))[1];
|
|
|
|
const uint32_t addressRegisters = ((__local uint32_t*)(R + 16))[2];
|
|
__local const uint64_t* readReg0 = (__local uint64_t*)(((__local uint8_t*)R) + (addressRegisters & 0xff));
|
|
__local const uint64_t* readReg1 = (__local uint64_t*)(((__local uint8_t*)R) + ((addressRegisters >> 8) & 0xff));
|
|
__local const uint32_t* readReg2 = (__local uint32_t*)(((__local uint8_t*)R) + ((addressRegisters >> 16) & 0xff));
|
|
__local const uint32_t* readReg3 = (__local uint32_t*)(((__local uint8_t*)R) + (addressRegisters >> 24));
|
|
|
|
const uint32_t datasetOffset = ((__local uint32_t*)(R + 16))[3];
|
|
__global const uint8_t* dataset = ((__global const uint8_t*)dataset_ptr) + datasetOffset;
|
|
|
|
const uint32_t fp_reg_offset = 64 + ((global_index & 1) << 3);
|
|
const uint32_t fp_reg_group_A_offset = 192 + ((global_index & 1) << 3);
|
|
|
|
__local uint64_t* eMask = R + 18;
|
|
|
|
const uint32_t program_length = ((__local uint32_t*)(R + 20))[0];
|
|
uint32_t fprc = ((__global uint32_t*)rounding)[idx];
|
|
|
|
uint32_t spAddr0 = first ? mx : 0;
|
|
uint32_t spAddr1 = first ? ma : 0;
|
|
|
|
__global uint8_t* scratchpad = ((__global uint8_t*)scratchpads) + idx * (uint64_t)(RANDOMX_SCRATCHPAD_L3 + 64);
|
|
|
|
const bool f_group = (sub < 4);
|
|
|
|
__local double* fe = f_group ? (F + sub * 2) : (E + (sub - 4) * 2);
|
|
__local double* f = F + sub;
|
|
__local double* e = E + sub;
|
|
|
|
const uint64_t andMask = f_group ? (uint64_t)(-1) : dynamicMantissaMask;
|
|
const uint64_t orMask1 = f_group ? 0 : eMask[0];
|
|
const uint64_t orMask2 = f_group ? 0 : eMask[1];
|
|
const uint64_t xexponentMask = (sub & 1) ? eMask[1] : eMask[0];
|
|
|
|
__local uint32_t* imm_buf = (__local uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t));
|
|
__local const uint32_t* compiled_program = (__local const uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t));
|
|
|
|
const uint32_t workers_mask = ((1 << WORKERS_PER_HASH) - 1) << ((get_local_id(0) / IDX_WIDTH) * IDX_WIDTH);
|
|
const uint32_t fp_workers_mask = 3 << (((sub >> 1) << 1) + (get_local_id(0) / IDX_WIDTH) * IDX_WIDTH);
|
|
|
|
#pragma unroll 1
|
|
for (int ic = 0; ic < num_iterations; ++ic)
|
|
{
|
|
__local uint64_t *r;
|
|
__global uint64_t *p0, *p1;
|
|
if ((WORKERS_PER_HASH <= 8) || (sub < 8))
|
|
{
|
|
const uint64_t spMix = *readReg0 ^ *readReg1;
|
|
spAddr0 ^= ((const uint32_t*)&spMix)[0];
|
|
spAddr1 ^= ((const uint32_t*)&spMix)[1];
|
|
spAddr0 &= ScratchpadL3Mask64;
|
|
spAddr1 &= ScratchpadL3Mask64;
|
|
|
|
p0 = (__global uint64_t*)(scratchpad + spAddr0 + sub * 8);
|
|
p1 = (__global uint64_t*)(scratchpad + spAddr1 + sub * 8);
|
|
|
|
r = R + sub;
|
|
*r ^= *p0;
|
|
|
|
uint64_t global_mem_data = *p1;
|
|
int32_t* q = (int32_t*)&global_mem_data;
|
|
|
|
fe[0] = load_F_E_groups(q[0], andMask, orMask1);
|
|
fe[1] = load_F_E_groups(q[1], andMask, orMask2);
|
|
}
|
|
|
|
//if ((global_index == 0) && (ic == 0))
|
|
//{
|
|
// printf("ic = %d (before)\n", ic);
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("f%d = %016llx\n", i, bit_cast<uint64_t>(F[i]));
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("e%d = %016llx\n", i, bit_cast<uint64_t>(E[i]));
|
|
// printf("\n");
|
|
//}
|
|
|
|
if ((WORKERS_PER_HASH == IDX_WIDTH) || (sub < WORKERS_PER_HASH))
|
|
fprc = inner_loop(program_length, compiled_program, sub, scratchpad, fp_reg_offset, fp_reg_group_A_offset, R, imm_buf, batch_size, fprc, fp_workers_mask, xexponentMask, workers_mask);
|
|
|
|
//if ((global_index == 0) && (ic == RANDOMX_PROGRAM_ITERATIONS - 1))
|
|
//{
|
|
// printf("ic = %d (after)\n", ic);
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("r%d = %016llx\n", i, R[i]);
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("f%d = %016llx\n", i, bit_cast<uint64_t>(F[i]));
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("e%d = %016llx\n", i, bit_cast<uint64_t>(E[i]));
|
|
// printf("\n");
|
|
//}
|
|
|
|
if ((WORKERS_PER_HASH <= 8) || (sub < 8))
|
|
{
|
|
mx ^= *readReg2 ^ *readReg3;
|
|
mx &= CacheLineAlignMask;
|
|
|
|
const uint64_t next_r = *r ^ *(__global const uint64_t*)(dataset + ma + sub * 8);
|
|
*r = next_r;
|
|
|
|
*p1 = next_r;
|
|
*p0 = as_ulong(f[0]) ^ as_ulong(e[0]);
|
|
|
|
uint32_t tmp = ma;
|
|
ma = mx;
|
|
mx = tmp;
|
|
|
|
spAddr0 = 0;
|
|
spAddr1 = 0;
|
|
}
|
|
}
|
|
|
|
//if (global_index == 0)
|
|
//{
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("r%d = %016llx\n", i, R[i]);
|
|
// for (int i = 0; i < 8; ++i)
|
|
// printf("fe%d = %016llx\n", i, bit_cast<uint64_t>(F[i]) ^ bit_cast<uint64_t>(E[i]));
|
|
// printf("\n");
|
|
//}
|
|
|
|
if ((WORKERS_PER_HASH > 8) && (sub >= 8))
|
|
return;
|
|
|
|
__global uint64_t* p = ((__global uint64_t*)vm_states) + idx * (VM_STATE_SIZE / sizeof(uint64_t));
|
|
p[sub] = R[sub];
|
|
|
|
if (sub == 0)
|
|
{
|
|
((__global uint32_t*)rounding)[idx] = fprc;
|
|
}
|
|
|
|
if (last)
|
|
{
|
|
p[sub + 8] = as_ulong(F[sub]) ^ as_ulong(E[sub]);
|
|
p[sub + 16] = as_ulong(E[sub]);
|
|
}
|
|
else if (sub == 0)
|
|
{
|
|
((__global uint32_t*)(p + 16))[0] = ma;
|
|
((__global uint32_t*)(p + 16))[1] = mx;
|
|
}
|
|
}
|
|
|
|
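// find_shares: hashes[] holds four 64-bit words per hash and only the top word (index 3) is
// compared against the target. shares[0xFF] serves as an atomic result counter and
// shares[0..0xFE] receive the winning nonces (start_nonce + global work-item index).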
__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void find_shares(__global const uint64_t* hashes, uint64_t target, uint32_t start_nonce, __global uint32_t* shares)
{
    const uint32_t global_index = get_global_id(0);

    if (hashes[global_index * 4 + 3] < target) {
    //if (global_index == 0) {
        const uint32_t idx = atomic_inc(shares + 0xFF);
        if (idx < 0xFF) {
            shares[idx] = start_nonce + global_index;
        }
    }
}