/*
Copyright (c) 2019 SChernykh
Portions Copyright (c) 2018-2019 tevador

This file is part of RandomX OpenCL.

RandomX OpenCL is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

RandomX OpenCL is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with RandomX OpenCL. If not, see <http://www.gnu.org/licenses/>.
*/

#pragma OPENCL EXTENSION cl_khr_fp64 : enable

#define CacheLineSize 64
#define ScratchpadL3Mask64 (RANDOMX_SCRATCHPAD_L3 - CacheLineSize)
#define CacheLineAlignMask ((RANDOMX_DATASET_BASE_SIZE - 1) & ~(CacheLineSize - 1))

#define mantissaSize 52
#define exponentSize 11
#define mantissaMask ((1UL << mantissaSize) - 1)
#define exponentMask ((1UL << exponentSize) - 1)
#define exponentBias 1023

#define constExponentBits 0x300
#define dynamicExponentBits 4
#define staticExponentBits 4
#define dynamicMantissaMask ((1UL << (mantissaSize + dynamicExponentBits)) - 1)

#define RegistersCount 8
#define RegisterCountFlt (RegistersCount / 2)

#define ConditionMask ((1 << RANDOMX_JUMP_BITS) - 1)
#define ConditionOffset RANDOMX_JUMP_OFFSET
#define StoreL3Condition 14

#define DatasetExtraItems (RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE)

#define RegisterNeedsDisplacement 5

//
// VM state:
//
// Bytes 0-255: registers
// Bytes 256-1023: imm32 values (up to 192 values can be stored). IMUL_RCP and CBRANCH use 2 consecutive imm32 values.
// Bytes 1024-2047: up to 256 instructions
//
// Instruction encoding:
//
// Bits 0-2: dst (0-7)
// Bits 3-5: src (0-7)
// Bits 6-13: imm32/64 offset (in DWORDs, 0-191)
// Bit 14: src location (register, scratchpad)
// Bits 15-16: src shift (0-3), ADD/MUL switch for FMA instruction
// Bit 17: src=imm32
// Bit 18: src=imm64
// Bit 19: src = -src
// Bits 20-23: opcode (add_rs, add, mul, umul_hi, imul_hi, neg, xor, ror, swap, cbranch, store, fswap, fma, fsqrt, fdiv, cfround)
// Bits 24-27: how many parallel instructions to run starting with this one (1-16)
// Bits 28-31: how many of them are FP instructions (0-8)
//

#define DST_OFFSET 0
#define SRC_OFFSET 3
#define IMM_OFFSET 6
#define LOC_OFFSET 14
#define SHIFT_OFFSET 15
#define SRC_IS_IMM32_OFFSET 17
#define SRC_IS_IMM64_OFFSET 18
#define NEGATIVE_SRC_OFFSET 19
#define OPCODE_OFFSET 20
#define NUM_INSTS_OFFSET 24
#define NUM_FP_INSTS_OFFSET 28

// ISWAP r0, r0
#define INST_NOP (8 << OPCODE_OFFSET)

typedef uchar uint8_t;
typedef ushort uint16_t;
typedef uint uint32_t;
typedef ulong uint64_t;
typedef int int32_t;
typedef long int64_t;

double getSmallPositiveFloatBits(uint64_t entropy)
{
	uint64_t exponent = entropy >> 59; // 0..31
	uint64_t mantissa = entropy & mantissaMask;
	exponent += exponentBias;
	exponent &= exponentMask;
	exponent <<= mantissaSize;
	return as_double(exponent | mantissa);
}

uint64_t getStaticExponent(uint64_t entropy)
{
	uint64_t exponent = constExponentBits;
	exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits;
	exponent <<= mantissaSize;
	return exponent;
}

uint64_t getFloatMask(uint64_t entropy)
{
	const uint64_t mask22bit = (1UL << 22) - 1;
	return (entropy & mask22bit) | getStaticExponent(entropy);
}
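// Worked example (editor's sketch, not used by the kernels): how entropy maps
// to the initial "a" registers. getSmallPositiveFloatBits() always yields a
// value in [1.0, 2^32) because the top 5 entropy bits pick an unbiased
// exponent in 0..31:
//
//   getSmallPositiveFloatBits(0)         == 1.0  (exponent 1023, mantissa 0)
//   getSmallPositiveFloatBits(1UL << 59) == 2.0  (exponent 1024, mantissa 0)
//
// getFloatMask() keeps the low 22 bits of the entropy and combines them with
// a static exponent, producing the or-masks applied to the E group registers
// in execute_vm.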
void set_buffer(__local uint32_t *dst_buf, uint32_t N, const uint32_t value)
{
	uint32_t i = get_local_id(0) * sizeof(uint32_t);
	const uint32_t step = get_local_size(0) * sizeof(uint32_t);
	__local uint8_t* dst = ((__local uint8_t*)dst_buf) + i;
	while (i < sizeof(uint32_t) * N)
	{
		*(__local uint32_t*)(dst) = value;
		dst += step;
		i += step;
	}
}

uint64_t imul_rcp_value(uint32_t divisor)
{
	if ((divisor & (divisor - 1)) == 0)
	{
		return 1UL;
	}

	const uint64_t p2exp63 = 1UL << 63;

	uint64_t quotient = p2exp63 / divisor;
	uint64_t remainder = p2exp63 % divisor;

	const uint32_t bsr = 31 - clz(divisor);

	for (uint32_t shift = 0; shift <= bsr; ++shift)
	{
		const bool b = (remainder >= divisor - remainder);
		quotient = (quotient << 1) | (b ? 1 : 0);
		remainder = (remainder << 1) - (b ? divisor : 0);
	}

	return quotient;
}
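// Worked example (editor's note): for a non-power-of-two divisor this long
// division computes floor(2^(64 + bsr) / divisor), i.e. the 64-bit fixed
// point reciprocal used by IMUL_RCP. For divisor = 3 (bsr = 1):
//
//   imul_rcp_value(3) == floor(2^65 / 3) == 0xAAAAAAAAAAAAAAAA
//
// so IMUL_RCP multiplies by the reciprocal instead of dividing. Powers of
// two return 1, which the opcode generator below turns into a NOP.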
#define set_byte(a, position, value) do { ((uint8_t*)&(a))[(position)] = (value); } while (0)

uint32_t get_byte(uint64_t a, uint32_t position) { return (a >> (position << 3)) & 0xFF; }

#define update_max(value, next_value) do { if ((value) < (next_value)) (value) = (next_value); } while (0)

__attribute__((reqd_work_group_size(32, 1, 1)))
__kernel void init_vm(__global const void* entropy_data, __global void* vm_states, __global uint32_t* rounding, uint32_t iteration)
{
#if RANDOMX_PROGRAM_SIZE <= 256
	typedef uint8_t exec_t;
#else
	typedef uint16_t exec_t;
#endif

	__local uint32_t execution_plan_buf[RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * (32 / 8) * sizeof(exec_t) / sizeof(uint32_t)];

	set_buffer(execution_plan_buf, sizeof(execution_plan_buf) / sizeof(uint32_t), 0);
	barrier(CLK_LOCAL_MEM_FENCE);

	const uint32_t global_index = get_global_id(0);
	const uint32_t idx = global_index / 8;
	const uint32_t sub = global_index % 8;

	__local exec_t* execution_plan = (__local exec_t*)(execution_plan_buf + (get_local_id(0) / 8) * RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH * sizeof(exec_t) / sizeof(uint32_t));

	__global uint64_t* R = ((__global uint64_t*)vm_states) + idx * VM_STATE_SIZE / sizeof(uint64_t);
	R[sub] = 0;

	const __global uint64_t* entropy = ((const __global uint64_t*)entropy_data) + idx * ENTROPY_SIZE / sizeof(uint64_t);

	__global double* A = (__global double*)(R + 24);
	A[sub] = getSmallPositiveFloatBits(entropy[sub]);

	if (sub == 0)
	{
		if (iteration == 0)
			rounding[idx] = 0;

		__global uint2* src_program = (__global uint2*)(entropy + 128 / sizeof(uint64_t));

#if RANDOMX_PROGRAM_SIZE <= 256
		uint64_t registerLastChanged = 0;
		uint64_t registerWasChanged = 0;
#else
		int32_t registerLastChanged[8] = { -1, -1, -1, -1, -1, -1, -1, -1 };
#endif

		// Initialize CBRANCH instructions
		for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
		{
			// Clear all src flags (branch target, FP, branch)
			*(__global uint32_t*)(src_program + i) &= ~(0xF8U << 8);

			const uint2 src_inst = src_program[i];
			uint2 inst = src_inst;

			uint32_t opcode = inst.x & 0xff;
			const uint32_t dst = (inst.x >> 8) & 7;
			const uint32_t src = (inst.x >> 16) & 7;

			if (opcode < RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M)
			{
#if RANDOMX_PROGRAM_SIZE <= 256
				set_byte(registerLastChanged, dst, i);
				set_byte(registerWasChanged, dst, 1);
#else
				registerLastChanged[dst] = i;
#endif
				continue;
			}
			opcode -= RANDOMX_FREQ_IADD_RS + RANDOMX_FREQ_IADD_M + RANDOMX_FREQ_ISUB_R + RANDOMX_FREQ_ISUB_M + RANDOMX_FREQ_IMUL_R + RANDOMX_FREQ_IMUL_M + RANDOMX_FREQ_IMULH_R + RANDOMX_FREQ_IMULH_M + RANDOMX_FREQ_ISMULH_R + RANDOMX_FREQ_ISMULH_M;

			if (opcode < RANDOMX_FREQ_IMUL_RCP)
			{
				if (inst.y & (inst.y - 1))
				{
#if RANDOMX_PROGRAM_SIZE <= 256
					set_byte(registerLastChanged, dst, i);
					set_byte(registerWasChanged, dst, 1);
#else
					registerLastChanged[dst] = i;
#endif
				}
				continue;
			}
			opcode -= RANDOMX_FREQ_IMUL_RCP;

			if (opcode < RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
			{
#if RANDOMX_PROGRAM_SIZE <= 256
				set_byte(registerLastChanged, dst, i);
				set_byte(registerWasChanged, dst, 1);
#else
				registerLastChanged[dst] = i;
#endif
				continue;
			}
			opcode -= RANDOMX_FREQ_INEG_R + RANDOMX_FREQ_IXOR_R + RANDOMX_FREQ_IXOR_M + RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;

			if (opcode < RANDOMX_FREQ_ISWAP_R)
			{
				if (src != dst)
				{
#if RANDOMX_PROGRAM_SIZE <= 256
					set_byte(registerLastChanged, dst, i);
					set_byte(registerWasChanged, dst, 1);
					set_byte(registerLastChanged, src, i);
					set_byte(registerWasChanged, src, 1);
#else
					registerLastChanged[dst] = i;
					registerLastChanged[src] = i;
#endif
				}
				continue;
			}
			opcode -= RANDOMX_FREQ_ISWAP_R;

			if (opcode < RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R)
			{
				// Mark FP instruction (src |= 0x20)
				*(__global uint32_t*)(src_program + i) |= 0x20 << 8;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSWAP_R + RANDOMX_FREQ_FADD_R + RANDOMX_FREQ_FADD_M + RANDOMX_FREQ_FSUB_R + RANDOMX_FREQ_FSUB_M + RANDOMX_FREQ_FSCAL_R + RANDOMX_FREQ_FMUL_R + RANDOMX_FREQ_FDIV_M + RANDOMX_FREQ_FSQRT_R;

			if (opcode < RANDOMX_FREQ_CBRANCH)
			{
				const uint32_t creg = dst;
#if RANDOMX_PROGRAM_SIZE <= 256
				const uint32_t change = get_byte(registerLastChanged, dst);
				const int32_t lastChanged = (get_byte(registerWasChanged, dst) == 0) ? -1 : (int32_t)(change);

				// Store condition register and branch target in CBRANCH instruction
				*(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | ((lastChanged == -1) ? 0x90 : 0x10)) << 8) | (((uint32_t)(lastChanged) & 0xFF) << 16);
#else
				const int32_t lastChanged = registerLastChanged[dst];

				// Store condition register in CBRANCH instruction
				*(__global uint32_t*)(src_program + i) = (src_inst.x & 0xFF0000FFU) | ((creg | 0x10) << 8);
#endif

				// Mark branch target instruction (src |= 0x40)
				*(__global uint32_t*)(src_program + lastChanged + 1) |= 0x40 << 8;

#if RANDOMX_PROGRAM_SIZE <= 256
				uint32_t tmp = i | (i << 8);
				registerLastChanged = tmp | (tmp << 16);
				registerLastChanged = registerLastChanged | (registerLastChanged << 32);
				registerWasChanged = 0x0101010101010101UL;
#else
				registerLastChanged[0] = i;
				registerLastChanged[1] = i;
				registerLastChanged[2] = i;
				registerLastChanged[3] = i;
				registerLastChanged[4] = i;
				registerLastChanged[5] = i;
				registerLastChanged[6] = i;
				registerLastChanged[7] = i;
#endif
			}
		}

		uint64_t registerLatency = 0;
		uint64_t registerReadCycle = 0;
		uint64_t registerLatencyFP = 0;
		uint64_t registerReadCycleFP = 0;
		uint32_t ScratchpadHighLatency = 0;
		volatile uint32_t ScratchpadLatency = 0;

		int32_t first_available_slot = 0;
		int32_t first_allowed_slot_cfround = 0;
		int32_t last_used_slot = -1;
		int32_t last_memory_op_slot = -1;

		uint32_t num_slots_used = 0;
		uint32_t num_instructions = 0;

		int32_t first_instruction_slot = -1;
		bool first_instruction_fp = false;

		//if (global_index == 0)
		//{
		//	for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j)
		//	{
		//		print_inst(src_program[j]);
		//		printf("\n");
		//	}
		//	printf("\n");
		//}
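		// Editor's note on the scheduler below (a description, not new logic):
		// execution_plan is a grid of RANDOMX_PROGRAM_SIZE "cycles", each
		// WORKERS_PER_HASH slots wide; slot s maps to cycle s / WORKERS_PER_HASH
		// and lane s % WORKERS_PER_HASH. Each instruction is placed greedily in
		// the earliest free slot that satisfies its register/memory latencies,
		// FP instructions take two adjacent lanes, and branches/branch targets
		// pin the ordering by bumping first_available_slot.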
		// Schedule instructions
		bool update_branch_target_mark = false;
		bool first_available_slot_is_branch_target = false;
		for (uint32_t i = 0; i < RANDOMX_PROGRAM_SIZE; ++i)
		{
			const uint2 inst = src_program[i];

			uint32_t opcode = inst.x & 0xff;
			uint32_t dst = (inst.x >> 8) & 7;
			const uint32_t src = (inst.x >> 16) & 7;
			const uint32_t mod = (inst.x >> 24);

			bool is_branch_target = (inst.x & (0x40 << 8)) != 0;
			if (is_branch_target)
			{
				// If an instruction is a branch target, we can't move it before any previous instructions
				first_available_slot = last_used_slot + 1;

				// Mark this slot as a branch target
				// Whatever instruction takes this slot will receive branch target flag
				first_available_slot_is_branch_target = true;
			}

			const uint32_t dst_latency = get_byte(registerLatency, dst);
			const uint32_t src_latency = get_byte(registerLatency, src);
			const uint32_t reg_read_latency = (dst_latency > src_latency) ? dst_latency : src_latency;
			const uint32_t mem_read_latency = ((dst == src) && ((inst.y & ScratchpadL3Mask64) >= RANDOMX_SCRATCHPAD_L2)) ? ScratchpadHighLatency : ScratchpadLatency;

			uint32_t full_read_latency = mem_read_latency;
			update_max(full_read_latency, reg_read_latency);

			uint32_t latency = 0;
			bool is_memory_op = false;
			bool is_memory_store = false;
			bool is_nop = false;
			bool is_branch = false;
			bool is_swap = false;
			bool is_src_read = true;
			bool is_fp = false;
			bool is_cfround = false;

			do {
				if (opcode < RANDOMX_FREQ_IADD_RS)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_IADD_RS;

				if (opcode < RANDOMX_FREQ_IADD_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_IADD_M;

				if (opcode < RANDOMX_FREQ_ISUB_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_ISUB_R;

				if (opcode < RANDOMX_FREQ_ISUB_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_ISUB_M;

				if (opcode < RANDOMX_FREQ_IMUL_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_IMUL_R;

				if (opcode < RANDOMX_FREQ_IMUL_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_IMUL_M;

				if (opcode < RANDOMX_FREQ_IMULH_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_IMULH_R;

				if (opcode < RANDOMX_FREQ_IMULH_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_IMULH_M;

				if (opcode < RANDOMX_FREQ_ISMULH_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_ISMULH_R;

				if (opcode < RANDOMX_FREQ_ISMULH_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_ISMULH_M;

				if (opcode < RANDOMX_FREQ_IMUL_RCP)
				{
					is_src_read = false;
					if (inst.y & (inst.y - 1))
						latency = dst_latency;
					else
						is_nop = true;
					break;
				}
				opcode -= RANDOMX_FREQ_IMUL_RCP;

				if (opcode < RANDOMX_FREQ_INEG_R)
				{
					is_src_read = false;
					latency = dst_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_INEG_R;

				if (opcode < RANDOMX_FREQ_IXOR_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_IXOR_R;

				if (opcode < RANDOMX_FREQ_IXOR_M)
				{
					latency = full_read_latency;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_IXOR_M;

				if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
				{
					latency = reg_read_latency;
					break;
				}
				opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;

				if (opcode < RANDOMX_FREQ_ISWAP_R)
				{
					is_swap = true;
					if (dst != src)
						latency = reg_read_latency;
					else
						is_nop = true;
					break;
				}
				opcode -= RANDOMX_FREQ_ISWAP_R;

				if (opcode < RANDOMX_FREQ_FSWAP_R)
				{
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FSWAP_R;

				if (opcode < RANDOMX_FREQ_FADD_R)
				{
					dst %= RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FADD_R;

				if (opcode < RANDOMX_FREQ_FADD_M)
				{
					dst %= RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					update_max(latency, src_latency);
					update_max(latency, ScratchpadLatency);
					is_fp = true;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_FADD_M;

				if (opcode < RANDOMX_FREQ_FSUB_R)
				{
					dst %= RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FSUB_R;

				if (opcode < RANDOMX_FREQ_FSUB_M)
				{
					dst %= RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					update_max(latency, src_latency);
					update_max(latency, ScratchpadLatency);
					is_fp = true;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_FSUB_M;

				if (opcode < RANDOMX_FREQ_FSCAL_R)
				{
					dst %= RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FSCAL_R;

				if (opcode < RANDOMX_FREQ_FMUL_R)
				{
					dst = (dst % RegisterCountFlt) + RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FMUL_R;

				if (opcode < RANDOMX_FREQ_FDIV_M)
				{
					dst = (dst % RegisterCountFlt) + RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					update_max(latency, src_latency);
					update_max(latency, ScratchpadLatency);
					is_fp = true;
					is_memory_op = true;
					break;
				}
				opcode -= RANDOMX_FREQ_FDIV_M;

				if (opcode < RANDOMX_FREQ_FSQRT_R)
				{
					dst = (dst % RegisterCountFlt) + RegisterCountFlt;
					latency = get_byte(registerLatencyFP, dst);
					is_fp = true;
					is_src_read = false;
					break;
				}
				opcode -= RANDOMX_FREQ_FSQRT_R;

				if (opcode < RANDOMX_FREQ_CBRANCH)
				{
					is_src_read = false;
					is_branch = true;
					latency = dst_latency;

					// We can't move CBRANCH before any previous instructions
					first_available_slot = last_used_slot + 1;

					break;
				}
				opcode -= RANDOMX_FREQ_CBRANCH;

				if (opcode < RANDOMX_FREQ_CFROUND)
				{
					latency = src_latency;
					is_cfround = true;
					break;
				}
				opcode -= RANDOMX_FREQ_CFROUND;

				if (opcode < RANDOMX_FREQ_ISTORE)
				{
					latency = reg_read_latency;
					update_max(latency, (last_memory_op_slot + WORKERS_PER_HASH) / WORKERS_PER_HASH);
					is_memory_op = true;
					is_memory_store = true;
					break;
				}
				opcode -= RANDOMX_FREQ_ISTORE;

				is_nop = true;
			} while (false);

			if (is_nop)
			{
				if (is_branch_target)
				{
					// Mark next non-NOP instruction as the branch target instead of this NOP
					update_branch_target_mark = true;
				}
				continue;
			}

			if (update_branch_target_mark)
			{
				*(__global uint32_t*)(src_program + i) |= 0x40 << 8;
				update_branch_target_mark = false;
				is_branch_target = true;
			}

			int32_t first_allowed_slot = first_available_slot;
			update_max(first_allowed_slot, latency * WORKERS_PER_HASH);
			if (is_cfround)
				update_max(first_allowed_slot, first_allowed_slot_cfround);
			else
				update_max(first_allowed_slot, get_byte(is_fp ? registerReadCycleFP : registerReadCycle, dst) * WORKERS_PER_HASH);

			if (is_swap)
				update_max(first_allowed_slot, get_byte(registerReadCycle, src) * WORKERS_PER_HASH);

			int32_t slot_to_use = last_used_slot + 1;
			update_max(slot_to_use, first_allowed_slot);

			if (is_fp)
			{
				slot_to_use = -1;
				for (int32_t j = first_allowed_slot; slot_to_use < 0; ++j)
				{
					if ((execution_plan[j] == 0) && (execution_plan[j + 1] == 0) && ((j + 1) % WORKERS_PER_HASH))
					{
						bool blocked = false;
						for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k)
						{
							if (execution_plan[k] || (k == first_instruction_slot))
							{
								const uint32_t inst = src_program[execution_plan[k]].x;

								// If there is an integer instruction which is a branch target or a branch, or this FP instruction is a branch target itself, we can't reorder it to add more FP instructions to this cycle
								if (((inst & (0x20 << 8)) == 0) && (((inst & (0x50 << 8)) != 0) || is_branch_target))
								{
									blocked = true;
									continue;
								}
							}
						}

						if (!blocked)
						{
							for (int32_t k = (j / WORKERS_PER_HASH) * WORKERS_PER_HASH; k < j; ++k)
							{
								if (execution_plan[k] || (k == first_instruction_slot))
								{
									const uint32_t inst = src_program[execution_plan[k]].x;
									if ((inst & (0x20 << 8)) == 0)
									{
										execution_plan[j] = execution_plan[k];
										execution_plan[j + 1] = execution_plan[k + 1];
										if (first_instruction_slot == k) first_instruction_slot = j;
										if (first_instruction_slot == k + 1) first_instruction_slot = j + 1;
										slot_to_use = k;
										break;
									}
								}
							}

							if (slot_to_use < 0)
							{
								slot_to_use = j;
							}

							break;
						}
					}
				}
			}
			else
			{
				for (int32_t j = first_allowed_slot; j <= last_used_slot; ++j)
				{
					if (execution_plan[j] == 0)
					{
						slot_to_use = j;
						break;
					}
				}
			}

			if (i == 0)
			{
				first_instruction_slot = slot_to_use;
				first_instruction_fp = is_fp;
			}

			if (is_cfround)
			{
				first_allowed_slot_cfround = slot_to_use - (slot_to_use % WORKERS_PER_HASH) + WORKERS_PER_HASH;
			}

			++num_instructions;

			execution_plan[slot_to_use] = i;
			++num_slots_used;

			if (is_fp)
			{
				execution_plan[slot_to_use + 1] = i;
				++num_slots_used;
			}

			const uint32_t next_latency = (slot_to_use / WORKERS_PER_HASH) + 1;

			if (is_src_read)
			{
				int32_t value = get_byte(registerReadCycle, src);
				update_max(value, slot_to_use / WORKERS_PER_HASH);
				set_byte(registerReadCycle, src, value);
			}

			if (is_memory_op)
			{
				update_max(last_memory_op_slot, slot_to_use);
			}

			if (is_cfround)
			{
				const uint32_t t = next_latency | (next_latency << 8);
				registerLatencyFP = t | (t << 16);
				registerLatencyFP = registerLatencyFP | (registerLatencyFP << 32);
			}
			else if (is_fp)
			{
				set_byte(registerLatencyFP, dst, next_latency);

				int32_t value = get_byte(registerReadCycleFP, dst);
				update_max(value, slot_to_use / WORKERS_PER_HASH);
				set_byte(registerReadCycleFP, dst, value);
			}
			else
			{
				if (!is_memory_store && !is_nop)
				{
					set_byte(registerLatency, dst, next_latency);
					if (is_swap)
						set_byte(registerLatency, src, next_latency);

					int32_t value = get_byte(registerReadCycle, dst);
					update_max(value, slot_to_use / WORKERS_PER_HASH);
					set_byte(registerReadCycle, dst, value);
				}

				if (is_branch)
				{
					const uint32_t t = next_latency | (next_latency << 8);
					registerLatency = t | (t << 16);
					registerLatency = registerLatency | (registerLatency << 32);
				}

				if (is_memory_store)
				{
					int32_t value = get_byte(registerReadCycle, dst);
					update_max(value, slot_to_use / WORKERS_PER_HASH);
					set_byte(registerReadCycle, dst, value);

					ScratchpadLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
					if ((mod >> 4) >= StoreL3Condition)
						ScratchpadHighLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
				}
			}

			if (execution_plan[first_available_slot] || (first_available_slot == first_instruction_slot))
			{
				if (first_available_slot_is_branch_target)
				{
					src_program[i].x |= 0x40 << 8;
					first_available_slot_is_branch_target = false;
				}

				if (is_fp)
					++first_available_slot;

				do {
					++first_available_slot;
				} while ((first_available_slot < RANDOMX_PROGRAM_SIZE * WORKERS_PER_HASH) && (execution_plan[first_available_slot] != 0));
			}

			if (is_branch_target)
			{
				update_max(first_available_slot, is_fp ? (slot_to_use + 2) : (slot_to_use + 1));
			}

			update_max(last_used_slot, is_fp ? (slot_to_use + 1) : slot_to_use);
			while (execution_plan[last_used_slot] || (last_used_slot == first_instruction_slot) || ((last_used_slot == first_instruction_slot + 1) && first_instruction_fp))
			{
				++last_used_slot;
			}
			--last_used_slot;

			if (is_fp && (last_used_slot >= first_allowed_slot_cfround))
				first_allowed_slot_cfround = last_used_slot + 1;

			//if (global_index == 0)
			//{
			//	printf("slot_to_use = %d, first_available_slot = %d, last_used_slot = %d\n", slot_to_use, first_available_slot, last_used_slot);
			//	for (int j = 0; j <= last_used_slot; ++j)
			//	{
			//		if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp))
			//		{
			//			print_inst(src_program[execution_plan[j]]);
			//			printf(" | ");
			//		}
			//		else
			//		{
			//			printf(" | ");
			//		}
			//		if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n");
			//	}
			//	printf("\n\n");
			//}
		}

		//if (global_index == 0)
		//{
		//	printf("IPC = %.3f, WPC = %.3f, num_instructions = %u, num_slots_used = %u, first_instruction_slot = %d, last_used_slot = %d, registerLatency = %016llx, registerLatencyFP = %016llx \n",
		//		num_instructions / static_cast<double>(last_used_slot / WORKERS_PER_HASH + 1),
		//		num_slots_used / static_cast<double>(last_used_slot / WORKERS_PER_HASH + 1),
		//		num_instructions,
		//		num_slots_used,
		//		first_instruction_slot,
		//		last_used_slot,
		//		registerLatency,
		//		registerLatencyFP
		//	);
		//	//for (int j = 0; j < RANDOMX_PROGRAM_SIZE; ++j)
		//	//{
		//	//	print_inst(src_program[j]);
		//	//	printf("\n");
		//	//}
		//	//printf("\n");
		//	for (int j = 0; j <= last_used_slot; ++j)
		//	{
		//		if (execution_plan[j] || (j == first_instruction_slot) || ((j == first_instruction_slot + 1) && first_instruction_fp))
		//		{
		//			print_inst(src_program[execution_plan[j]]);
		//			printf(" | ");
		//		}
		//		else
		//		{
		//			printf(" | ");
		//		}
		//		if (((j + 1) % WORKERS_PER_HASH) == 0) printf("\n");
		//	}
		//	printf("\n\n");
		//}

		//atomicAdd((uint32_t*)num_vm_cycles, (last_used_slot / WORKERS_PER_HASH) + 1);
		//atomicAdd((uint32_t*)(num_vm_cycles) + 1, num_slots_used);
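		// Editor's note: the next few lines pack the four RandomX address
		// registers as byte offsets into R. Entropy bit n selects register
		// 2n or 2n+1 for readReg_n; multiplying by sizeof(uint64_t) turns
		// the register index into a byte offset, so execute_vm can add it
		// directly to the base pointer R.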
		uint32_t ma = (uint32_t)(entropy[8]) & CacheLineAlignMask;
		uint32_t mx = (uint32_t)(entropy[10]) & CacheLineAlignMask;

		uint32_t addressRegisters = (uint32_t)(entropy[12]);
		addressRegisters = ((addressRegisters & 1) | (((addressRegisters & 2) ? 3U : 2U) << 8) | (((addressRegisters & 4) ? 5U : 4U) << 16) | (((addressRegisters & 8) ? 7U : 6U) << 24)) * sizeof(uint64_t);

		uint32_t datasetOffset = (entropy[13] & DatasetExtraItems) * CacheLineSize;

		ulong2 eMask = *(__global ulong2*)(entropy + 14);
		eMask.x = getFloatMask(eMask.x);
		eMask.y = getFloatMask(eMask.y);

		((__global uint32_t*)(R + 16))[0] = ma;
		((__global uint32_t*)(R + 16))[1] = mx;
		((__global uint32_t*)(R + 16))[2] = addressRegisters;
		((__global uint32_t*)(R + 16))[3] = datasetOffset;
		((__global ulong2*)(R + 18))[0] = eMask;

		__global uint32_t* imm_buf = (__global uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t));
		uint32_t imm_index = 0;
		int32_t imm_index_fscal_r = -1;
		__global uint32_t* compiled_program = (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t));

		// Generate opcodes for execute_vm
		int32_t branch_target_slot = -1;
		int32_t k = -1;
		for (int32_t i = 0; i <= last_used_slot; ++i)
		{
			if (!(execution_plan[i] || (i == first_instruction_slot) || ((i == first_instruction_slot + 1) && first_instruction_fp)))
				continue;

			uint32_t num_workers = 1;
			uint32_t num_fp_insts = 0;
			while ((i + num_workers <= last_used_slot) && ((i + num_workers) % WORKERS_PER_HASH) && (execution_plan[i + num_workers] || (i + num_workers == first_instruction_slot) || ((i + num_workers == first_instruction_slot + 1) && first_instruction_fp)))
			{
				if ((num_workers & 1) && ((src_program[execution_plan[i + num_workers]].x & (0x20 << 8)) != 0))
					++num_fp_insts;
				++num_workers;
			}

			//if (global_index == 0)
			//	printf("i = %d, num_workers = %u, num_fp_insts = %u\n", i, num_workers, num_fp_insts);

			num_workers = ((num_workers - 1) << NUM_INSTS_OFFSET) | (num_fp_insts << NUM_FP_INSTS_OFFSET);

			const uint2 src_inst = src_program[execution_plan[i]];
			uint2 inst = src_inst;

			uint32_t opcode = inst.x & 0xff;
			const uint32_t dst = (inst.x >> 8) & 7;
			const uint32_t src = (inst.x >> 16) & 7;
			const uint32_t mod = (inst.x >> 24);

			const bool is_fp = (src_inst.x & (0x20 << 8)) != 0;
			if (is_fp && ((i & 1) == 0))
				++i;

			const bool is_branch_target = (src_inst.x & (0x40 << 8)) != 0;
			if (is_branch_target && (branch_target_slot < 0))
				branch_target_slot = k;

			++k;

			inst.x = INST_NOP;

			if (opcode < RANDOMX_FREQ_IADD_RS)
			{
				const uint32_t shift = (mod >> 2) % 4;
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (shift << SHIFT_OFFSET);
				if (dst != RegisterNeedsDisplacement)
				{
					// Encode regular ADD (opcode 1)
					inst.x |= (1 << OPCODE_OFFSET);
				}
				else
				{
					// Encode ADD with src and imm32 (opcode 0)
					inst.x |= imm_index << IMM_OFFSET;
					if (imm_index < IMM_INDEX_COUNT)
						imm_buf[imm_index++] = inst.y;
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IADD_RS;

			if (opcode < RANDOMX_FREQ_IADD_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IADD_M;

			if (opcode < RANDOMX_FREQ_ISUB_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
				if (src == dst)
				{
					inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
					if (imm_index < IMM_INDEX_COUNT)
						imm_buf[imm_index++] = inst.y;
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISUB_R;

			if (opcode < RANDOMX_FREQ_ISUB_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (1 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISUB_M;

			if (opcode < RANDOMX_FREQ_IMUL_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET);
				if (src == dst)
				{
					inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
					if (imm_index < IMM_INDEX_COUNT)
						imm_buf[imm_index++] = inst.y;
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IMUL_R;

			if (opcode < RANDOMX_FREQ_IMUL_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (2 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IMUL_M;

			if (opcode < RANDOMX_FREQ_IMULH_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (6 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IMULH_R;

			if (opcode < RANDOMX_FREQ_IMULH_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (6 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IMULH_M;

			if (opcode < RANDOMX_FREQ_ISMULH_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (4 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISMULH_R;

			if (opcode < RANDOMX_FREQ_ISMULH_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (4 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISMULH_M;

			if (opcode < RANDOMX_FREQ_IMUL_RCP)
			{
				const uint64_t r = imul_rcp_value(inst.y);
				if (r == 1)
				{
					*(compiled_program++) = INST_NOP | num_workers;
					continue;
				}

				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (2 << OPCODE_OFFSET);
				inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM64_OFFSET);
				if (imm_index < IMM_INDEX_COUNT - 1)
				{
					imm_buf[imm_index] = ((const uint32_t*)&r)[0];
					imm_buf[imm_index + 1] = ((const uint32_t*)&r)[1];
					imm_index += 2;
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IMUL_RCP;

			if (opcode < RANDOMX_FREQ_INEG_R)
			{
				inst.x = (dst << DST_OFFSET) | (5 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_INEG_R;

			if (opcode < RANDOMX_FREQ_IXOR_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (3 << OPCODE_OFFSET);
				if (src == dst)
				{
					inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
					if (imm_index < IMM_INDEX_COUNT)
						imm_buf[imm_index++] = inst.y;
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IXOR_R;

			if (opcode < RANDOMX_FREQ_IXOR_M)
			{
				const uint32_t location = (src == dst) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (3 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IXOR_M;

			if (opcode < RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (7 << OPCODE_OFFSET);
				if (src == dst)
				{
					inst.x |= (imm_index << IMM_OFFSET) | (1 << SRC_IS_IMM32_OFFSET);
					if (imm_index < IMM_INDEX_COUNT)
						imm_buf[imm_index++] = inst.y;
				}
				if (opcode >= RANDOMX_FREQ_IROR_R)
				{
					inst.x |= (1 << NEGATIVE_SRC_OFFSET);
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_IROR_R + RANDOMX_FREQ_IROL_R;

			if (opcode < RANDOMX_FREQ_ISWAP_R)
			{
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (8 << OPCODE_OFFSET);
				*(compiled_program++) = ((src != dst) ? inst.x : INST_NOP) | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISWAP_R;

			if (opcode < RANDOMX_FREQ_FSWAP_R)
			{
				inst.x = (dst << DST_OFFSET) | (11 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSWAP_R;

			if (opcode < RANDOMX_FREQ_FADD_R)
			{
				inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FADD_R;

			if (opcode < RANDOMX_FREQ_FADD_M)
			{
				const uint32_t location = (mod % 4) ? 1 : 2;
				inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FADD_M;

			if (opcode < RANDOMX_FREQ_FSUB_R)
			{
				inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSUB_R;

			if (opcode < RANDOMX_FREQ_FSUB_M)
			{
				const uint32_t location = (mod % 4) ? 1 : 2;
				inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (12 << OPCODE_OFFSET) | (1 << NEGATIVE_SRC_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSUB_M;

			if (opcode < RANDOMX_FREQ_FSCAL_R)
			{
				inst.x = ((dst % RegisterCountFlt) << DST_OFFSET) | (1 << SRC_IS_IMM64_OFFSET) | (3 << OPCODE_OFFSET);
				if (imm_index_fscal_r >= 0)
				{
					inst.x |= (imm_index_fscal_r << IMM_OFFSET);
				}
				else
				{
					imm_index_fscal_r = imm_index;
					inst.x |= (imm_index << IMM_OFFSET);
					if (imm_index < IMM_INDEX_COUNT - 1)
					{
						imm_buf[imm_index] = 0;
						imm_buf[imm_index + 1] = 0x80F00000UL;
						imm_index += 2;
					}
				}
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSCAL_R;

			if (opcode < RANDOMX_FREQ_FMUL_R)
			{
				inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | ((src % RegisterCountFlt) << (SRC_OFFSET + 1)) | (1 << SHIFT_OFFSET) | (12 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FMUL_R;

			if (opcode < RANDOMX_FREQ_FDIV_M)
			{
				const uint32_t location = (mod % 4) ? 1 : 2;
				inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (15 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FDIV_M;

			if (opcode < RANDOMX_FREQ_FSQRT_R)
			{
				inst.x = (((dst % RegisterCountFlt) + RegisterCountFlt) << DST_OFFSET) | (14 << OPCODE_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_FSQRT_R;

			if (opcode < RANDOMX_FREQ_CBRANCH)
			{
				inst.x = (dst << DST_OFFSET) | (9 << OPCODE_OFFSET);
				inst.x |= (imm_index << IMM_OFFSET);

				const uint32_t cshift = (mod >> 4) + ConditionOffset;

				uint32_t imm = inst.y | (1U << cshift);
				if (cshift > 0)
					imm &= ~(1U << (cshift - 1));

				if (imm_index < IMM_INDEX_COUNT - 1)
				{
					imm_buf[imm_index] = imm;
					imm_buf[imm_index + 1] = cshift | ((uint32_t)(branch_target_slot) << 5);
					imm_index += 2;
				}
				else
				{
					// Data doesn't fit, skip it
					inst.x = INST_NOP;
				}

				branch_target_slot = -1;

				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_CBRANCH;

			if (opcode < RANDOMX_FREQ_CFROUND)
			{
				inst.x = (src << SRC_OFFSET) | (13 << OPCODE_OFFSET) | ((inst.y & 63) << IMM_OFFSET);
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_CFROUND;

			if (opcode < RANDOMX_FREQ_ISTORE)
			{
				const uint32_t location = ((mod >> 4) >= StoreL3Condition) ? 3 : ((mod % 4) ? 1 : 2);
				inst.x = (dst << DST_OFFSET) | (src << SRC_OFFSET) | (1 << LOC_OFFSET) | (10 << OPCODE_OFFSET);
				inst.x |= imm_index << IMM_OFFSET;
				if (imm_index < IMM_INDEX_COUNT)
					imm_buf[imm_index++] = (inst.y & 0xFC1FFFFFU) | (((location == 1) ? LOC_L1 : ((location == 2) ? LOC_L2 : LOC_L3)) << 21);
				else
					inst.x = INST_NOP;
				*(compiled_program++) = inst.x | num_workers;
				continue;
			}
			opcode -= RANDOMX_FREQ_ISTORE;

			*(compiled_program++) = inst.x | num_workers;
		}

		((__global uint32_t*)(R + 20))[0] = (uint32_t)(compiled_program - (__global uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t)));
	}
}

void load_buffer(__local uint64_t *dst_buf, size_t N, __global const void* src_buf)
{
	uint32_t i = get_local_id(0) * sizeof(uint64_t);
	const uint32_t step = get_local_size(0) * sizeof(uint64_t);
	__global const uint8_t* src = ((__global const uint8_t*)src_buf) + get_group_id(0) * sizeof(uint64_t) * N + i;
	__local uint8_t* dst = ((__local uint8_t*)dst_buf) + i;
	while (i < sizeof(uint64_t) * N)
	{
		*(__local uint64_t*)(dst) = *(__global uint64_t*)(src);
		src += step;
		dst += step;
		i += step;
	}
}

double load_F_E_groups(int value, uint64_t andMask, uint64_t orMask)
{
	double t = convert_double_rtn(value);
	uint64_t x = as_ulong(t);
	x &= andMask;
	x |= orMask;
	return as_double(x);
}

// You're one ugly motherfucker!
double fma_soft(double a, double b, double c, uint32_t rounding_mode)
{
	if (rounding_mode == 0)
		return fma(a, b, c);

	if ((a == 0.0) || (b == 0.0))
		return c;

	if (b == 1.0)
	{
		if (c == 0.0)
			return a;

		if (c == -a)
		{
			const uint64_t minus_zero = 1UL << 63;
			return (rounding_mode == 1) ? as_double(minus_zero) : 0.0;
		}
	}

	const uint64_t mantissa_size = 52;
	const uint64_t mantissa_mask = (1UL << 52) - 1;
	const uint64_t exponent_size = 11;
	const uint64_t exponent_mask = (1 << exponent_size) - 1;

	uint2 a2 = as_uint2(a);
	uint2 b2 = as_uint2(b);
	uint2 c2 = as_uint2(c);

	const uint32_t exponent_a = (a2.y >> 20) & exponent_mask;
	const uint32_t exponent_b = (b2.y >> 20) & exponent_mask;
	const uint32_t exponent_c = (c2.y >> 20) & exponent_mask;

	if ((exponent_a == 2047) || (exponent_b == 2047) || (exponent_c == 2047))
	{
		const uint64_t inf = 2047UL << 52;
		return as_double(inf);
	}

	const uint32_t sign_a = a2.y >> 31;
	const uint32_t sign_b = b2.y >> 31;
	const uint32_t sign_c = c2.y >> 31;

	a2.y = (a2.y & ((1U << 20) - 1)) | (1U << 20);
	b2.y = (b2.y & ((1U << 20) - 1)) | (1U << 20);
	c2.y = (c2.y & ((1U << 20) - 1)) | (1U << 20);

	uint64_t mantissa_a = as_ulong(a2);
	uint64_t mantissa_b = as_ulong(b2);
	uint64_t mantissa_c = as_ulong(c2);

	uint64_t mul_result[2];
	mul_result[0] = mantissa_a * mantissa_b;
	mul_result[1] = mul_hi(mantissa_a, mantissa_b);

	uint32_t exp_correction = mul_result[1] >> 41;
	uint32_t exponent_mul_result = exponent_a + exponent_b + exp_correction - 1023;
	uint32_t sign_mul_result = sign_a ^ sign_b;

	if (exponent_mul_result >= 2047)
	{
		const uint64_t inf_rnd = (2047UL << 52) - (rounding_mode & 1);
		return as_double(inf_rnd);
	}

	uint64_t fma_result[2];
	uint64_t t[2];
	uint32_t exponent_fma_result;

	if (exponent_mul_result >= exponent_c)
	{
		uint32_t shift = 23 - exp_correction;
		fma_result[0] = mul_result[0] << shift;
		fma_result[1] = (mul_result[1] << shift) | (mul_result[0] >> (64 - shift));

		int32_t shift2 = (127 - 52) + (int32_t)(exponent_c - exponent_mul_result);
		if (shift2 >= 0)
		{
			if (shift2 >= 64)
			{
				t[0] = 0;
				t[1] = mantissa_c << (shift2 - 64);
			}
			else
			{
				t[0] = mantissa_c << shift2;
				t[1] = shift2 ? (mantissa_c >> (64 - shift2)) : 0;
			}
		}
		else
		{
			t[0] = (shift2 < -52) ? 0 : (mantissa_c >> (-shift2));
			t[1] = 0;
			if ((t[0] == 0) && (c != 0.0))
				t[0] = 1;
		}

		exponent_fma_result = exponent_mul_result;
	}
	else
	{
		t[0] = 0;
		t[1] = mantissa_c << 11;

		int32_t shift2 = (127 - 104 - exp_correction) + (int32_t)(exponent_mul_result - exponent_c);
		if (shift2 >= 0)
		{
			fma_result[0] = mul_result[0] << shift2;
			fma_result[1] = (mul_result[1] << shift2) | (shift2 ? (mul_result[0] >> (64 - shift2)) : 0);
		}
		else
		{
			shift2 = -shift2;
			if (shift2 >= 64)
			{
				shift2 -= 64;
				fma_result[0] = (shift2 < 64) ? (mul_result[1] >> shift2) : 0;
				fma_result[1] = 0;
				if (fma_result[0] == 0)
					fma_result[0] = 1;
			}
			else
			{
				fma_result[0] = (mul_result[0] >> shift2) | (mul_result[1] << (64 - shift2));
				fma_result[1] = mul_result[1] >> shift2;
			}
		}

		exponent_fma_result = exponent_c;
	}

	uint32_t sign_fma_result;

	if (sign_mul_result == sign_c)
	{
		fma_result[0] += t[0];
		fma_result[1] += t[1] + ((fma_result[0] < t[0]) ? 1 : 0);
		exp_correction = (fma_result[1] < t[1]) ? 1 : 0;
		sign_fma_result = sign_mul_result;
	}
	else
	{
		const uint32_t borrow = (fma_result[0] < t[0]) ? 1 : 0;
		fma_result[0] -= t[0];
		t[1] += borrow;
		const uint32_t change_sign = (fma_result[1] < t[1]) ? 1 : 0;
		fma_result[1] -= t[1];
		sign_fma_result = sign_mul_result ^ change_sign;
		if (change_sign)
		{
			fma_result[0] = -(int64_t)(fma_result[0]);
			fma_result[1] = ~fma_result[1];
			fma_result[1] += fma_result[0] ? 0 : 1;
		}

		if (fma_result[1] == 0)
		{
			if (fma_result[0] == 0)
				return 0.0;

			exponent_fma_result -= 64;
			fma_result[1] = fma_result[0];
			fma_result[0] = 0;
		}

		const uint32_t index = clz(fma_result[1]);
		if (index)
		{
			exponent_fma_result -= index;
			fma_result[1] = (fma_result[1] << index) | (fma_result[0] >> (64 - index));
		}

		exp_correction = 0;
	}

	const uint32_t shift = 11 + exp_correction;
	const uint32_t round_up = (fma_result[0] || (fma_result[1] & ((1 << shift) - 1))) ? 1 : 0;

	fma_result[1] >>= shift;
	fma_result[1] &= mantissa_mask;
	if (rounding_mode + sign_fma_result == 2)
	{
		fma_result[1] += round_up;
		if (fma_result[1] == (1UL << mantissa_size))
		{
			fma_result[1] = 0;
			++exponent_fma_result;
		}
	}
	fma_result[1] |= (uint64_t)(exponent_fma_result + exp_correction) << mantissa_size;
	fma_result[1] |= (uint64_t)(sign_fma_result) << 63;

	return as_double(fma_result[1]);
}

double div_rnd(double a, double b, uint32_t fprc)
{
	double y0 = 1.0 / b;

	// Do 1 Newton-Raphson iteration to get correct rounding
	const double t0 = a * y0;
	const double t1 = fma(-b, t0, a);
	double result = fma_soft(y0, t1, t0, fprc);

	// Check for infinity/NaN
	const uint64_t inf = 2047UL << 52;
	const uint64_t inf_rnd = inf - (fprc & 1);

	if (((as_ulong(result) >> 52) & 2047) == 2047)
		result = as_double(inf_rnd);

	if (as_ulong(a) == inf)
		result = a;

	return (a == b) ? 1.0 : result;
}
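// Editor's note: fma_soft() above and sqrt_rnd() below emulate the directed
// rounding selected by CFROUND. The fprc values, as used in this file, are:
// 0 = round to nearest even (hardware fma path), 1 = toward -infinity,
// 2 = toward +infinity, 3 = toward zero; rounding up in magnitude happens
// exactly when (fprc + sign) == 2. div_rnd() refines y0 = 1/b with one
// Newton-Raphson step: t0 = a*y0, t1 = a - b*t0, then the final
// fma_soft(y0, t1, t0, fprc) applies the requested rounding last.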
double sqrt_rnd(double x, uint32_t fprc)
{
	double y0 = rsqrt(x);

	// First Newton-Raphson iteration
	double t0 = y0 * x;
	double t1 = y0 * -0.5;
	t1 = fma(t1, t0, 0.5);					// 0.5 * (1.0 - y0 * y0 * x)
	const double y1_x = fma(t0, t1, t0);	// y1 * x = 0.5 * y0 * x * (3.0 - y0 * y0 * x)

	// Second Newton-Raphson iteration
	y0 *= 0.5;
	y0 = fma(y0, t1, y0);					// 0.5 * y1
	t1 = fma(-y1_x, y1_x, x);				// x * (1.0 - x * y1 * y1)
	double result = fma_soft(t1, y0, y1_x, fprc);	// x * 0.5 * y1 * (3.0 - x * y1 * y1)

	// Check for infinity
	if (*((uint64_t*) &x) == (2047UL << 52))
		result = x;

	return result;
}
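// Editor's sketch (hypothetical helper, not used by the kernels): how the
// fields documented at the top of this file are pulled out of one compiled
// instruction word, mirroring what inner_loop() does inline below.
uint32_t decode_field(uint32_t inst, uint32_t offset, uint32_t bits)
{
	return (inst >> offset) & ((1U << bits) - 1);
}
// e.g. decode_field(inst, DST_OFFSET, 3) is the dst register,
//      decode_field(inst, OPCODE_OFFSET, 4) is the internal opcode, and
//      decode_field(inst, NUM_INSTS_OFFSET, 4) + 1 is the number of
//      instructions dispatched in parallel starting with this one.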
uint32_t inner_loop(
	const uint32_t program_length,
	__local const uint32_t* compiled_program,
	const int32_t sub,
	__global uint8_t* scratchpad,
	const uint32_t fp_reg_offset,
	const uint32_t fp_reg_group_A_offset,
	__local uint64_t* R,
	__local uint32_t* imm_buf,
	const uint32_t batch_size,
	uint32_t fprc,
	const uint32_t fp_workers_mask,
	const uint64_t xexponentMask,
	const uint32_t workers_mask
)
{
	const int32_t sub2 = sub >> 1;

	imm_buf[IMM_INDEX_COUNT + 1] = fprc;

	#pragma unroll 1
	for (int32_t ip = 0; ip < program_length;)
	{
		imm_buf[IMM_INDEX_COUNT] = ip;

		uint32_t inst = compiled_program[ip];

		const int32_t num_workers = (inst >> NUM_INSTS_OFFSET) & (WORKERS_PER_HASH - 1);
		const int32_t num_fp_insts = (inst >> NUM_FP_INSTS_OFFSET) & (WORKERS_PER_HASH - 1);
		const int32_t num_insts = num_workers - num_fp_insts;

		if (sub <= num_workers)
		{
			const int32_t inst_offset = sub - num_fp_insts;
			const bool is_fp = inst_offset < num_fp_insts;
			inst = compiled_program[ip + (is_fp ? sub2 : inst_offset)];

			//if ((idx == 0) && (ic == 0))
			//{
			//	printf("num_fp_insts = %u, sub = %u, ip = %u, inst = %08x\n", num_fp_insts, sub, ip + ((sub < num_fp_insts * 2) ? (sub / 2) : (sub - num_fp_insts)), inst);
			//}

			//asm("// INSTRUCTION DECODING BEGIN");

			uint32_t opcode = (inst >> OPCODE_OFFSET) & 15;
			const uint32_t location = (inst >> LOC_OFFSET) & 1;

			const uint32_t reg_size_shift = is_fp ? 4 : 3;
			const uint32_t reg_base_offset = is_fp ? fp_reg_offset : 0;
			const uint32_t reg_base_src_offset = is_fp ? fp_reg_group_A_offset : 0;

			uint32_t dst_offset = (inst >> DST_OFFSET) & 7;
			dst_offset = reg_base_offset + (dst_offset << reg_size_shift);

			uint32_t src_offset = (inst >> SRC_OFFSET) & 7;
			src_offset = (src_offset << 3) + (location ? 0 : reg_base_src_offset);

			__local uint64_t* dst_ptr = (__local uint64_t*)((__local uint8_t*)(R) + dst_offset);
			__local uint64_t* src_ptr = (__local uint64_t*)((__local uint8_t*)(R) + src_offset);

			const uint32_t imm_offset = (inst >> IMM_OFFSET) & 255;
			__local const uint32_t* imm_ptr = imm_buf + imm_offset;

			uint64_t dst = *dst_ptr;
			uint64_t src = *src_ptr;

			uint2 imm;
			imm.x = imm_ptr[0];
			imm.y = imm_ptr[1];

			//asm("// INSTRUCTION DECODING END");

			if (location)
			{
				//asm("// SCRATCHPAD ACCESS BEGIN");

				const uint32_t loc_shift = (imm.x >> 21) & 31;
				const uint32_t mask = (0xFFFFFFFFU >> loc_shift) - 7;

				const bool is_read = (opcode != 10);
				uint32_t addr = is_read ? ((loc_shift == LOC_L3) ? 0 : (uint32_t)(src)) : (uint32_t)(dst);
				addr += (int32_t)(imm.x);
				addr &= mask;

				__global uint64_t* ptr = (__global uint64_t*)(scratchpad + addr);

				if (is_read)
				{
					src = *ptr;
				}
				else
				{
					*ptr = src;
					goto execution_end;
				}

				//asm("// SCRATCHPAD ACCESS END");
			}

			{
				//asm("// EXECUTION BEGIN");

				if (inst & (1 << SRC_IS_IMM32_OFFSET))
					src = (uint64_t)((int64_t)((int32_t)(imm.x)));

				// Check instruction opcodes (most frequent instructions come first)
				if (opcode <= 3)
				{
					//asm("// IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256) ------>");
					if (inst & (1 << NEGATIVE_SRC_OFFSET))
						src = (uint64_t)(-(int64_t)(src));

					if (opcode == 0)
						dst += (int32_t)(imm.x);

					const uint32_t shift = (inst >> SHIFT_OFFSET) & 3;
					if (opcode < 2)
						dst += src << shift;

					const uint64_t imm64 = *((uint64_t*) &imm);
					if (inst & (1 << SRC_IS_IMM64_OFFSET))
						src = imm64;

					if (opcode == 2)
						dst *= src;

					if (opcode == 3)
						dst ^= src;
					//asm("// <------ IADD_RS, IADD_M, ISUB_R, ISUB_M, IMUL_R, IMUL_M, IMUL_RCP, IXOR_R, IXOR_M, FSCAL_R (109/256)");
				}
				else if (opcode == 12)
				{
					//asm("// FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256) ------>");
					if (location)
						src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32))));

					if (inst & (1 << NEGATIVE_SRC_OFFSET))
						src ^= 0x8000000000000000UL;

					const bool is_mul = (inst & (1 << SHIFT_OFFSET)) != 0;

					const double a = as_double(dst);
					const double b = as_double(src);
					dst = as_ulong(fma_soft(a, is_mul ? b : 1.0, is_mul ? 0.0 : b, fprc));
					//asm("// <------ FADD_R, FADD_M, FSUB_R, FSUB_M, FMUL_R (74/256)");
				}
				else if (opcode == 9)
				{
					//asm("// CBRANCH (16/256) ------>");
					dst += (int32_t)(imm.x);
					if (((uint32_t)(dst) & (ConditionMask << (imm.y & 31))) == 0)
					{
						imm_buf[IMM_INDEX_COUNT] = (uint32_t)(((int32_t)(imm.y) >> 5) - num_insts);
					}
					//asm("// <------ CBRANCH (16/256)");
				}
				else if (opcode == 7)
				{
					//asm("// IROR_R, IROL_R (10/256) ------>");
					uint32_t shift1 = src & 63;
#if RANDOMX_FREQ_IROL_R > 0
					const uint32_t shift2 = 64 - shift1;
					const bool is_rol = (inst & (1 << NEGATIVE_SRC_OFFSET));
					dst = (dst >> (is_rol ? shift2 : shift1)) | (dst << (is_rol ? shift1 : shift2));
#else
					dst = (dst >> shift1) | (dst << (64 - shift1));
#endif
					//asm("// <------ IROR_R, IROL_R (10/256)");
				}
				else if (opcode == 14)
				{
					//asm("// FSQRT_R (6/256) ------>");
					dst = as_ulong(sqrt_rnd(as_double(dst), fprc));
					//asm("// <------ FSQRT_R (6/256)");
				}
				else if (opcode == 6)
				{
					//asm("// IMULH_R, IMULH_M (5/256) ------>");
					dst = mul_hi(dst, src);
					//asm("// <------ IMULH_R, IMULH_M (5/256)");
				}
				else if (opcode == 4)
				{
					//asm("// ISMULH_R, ISMULH_M (5/256) ------>");
					dst = (uint64_t)(mul_hi((int64_t)(dst), (int64_t)(src)));
					//asm("// <------ ISMULH_R, ISMULH_M (5/256)");
				}
				else if (opcode == 11)
				{
					//asm("// FSWAP_R (4/256) ------>");
					dst = *(__local uint64_t*)((__local uint8_t*)(R) + (dst_offset ^ 8));
					//asm("// <------ FSWAP_R (4/256)");
				}
				else if (opcode == 8)
				{
					//asm("// ISWAP_R (4/256) ------>");
					*src_ptr = dst;
					dst = src;
					//asm("// <------ ISWAP_R (4/256)");
				}
				else if (opcode == 15)
				{
					//asm("// FDIV_M (4/256) ------>");
					src = as_ulong(convert_double_rtn((int32_t)(src >> ((sub & 1) * 32))));
					src &= dynamicMantissaMask;
					src |= xexponentMask;
					dst = as_ulong(div_rnd(as_double(dst), as_double(src), fprc));
					//asm("// <------ FDIV_M (4/256)");
				}
				else if (opcode == 5)
				{
					//asm("// INEG_R (2/256) ------>");
					dst = (uint64_t)(-(int64_t)(dst));
					//asm("// <------ INEG_R (2/256)");
				}
				// CFROUND check will be skipped and removed entirely by the compiler if ROUNDING_MODE >= 0
				else if (ROUNDING_MODE < 0)
				{
					//asm("// CFROUND (1/256) ------>");
					imm_buf[IMM_INDEX_COUNT + 1] = ((src >> imm_offset) | (src << (64 - imm_offset))) & 3;
					//asm("// <------ CFROUND (1/256)");
					goto execution_end;
				}

				*dst_ptr = dst;
				//asm("// EXECUTION END");
			}
		}

execution_end:
		{
			//asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE BEGIN");
			barrier(CLK_LOCAL_MEM_FENCE);
			ip = imm_buf[IMM_INDEX_COUNT];
			fprc = imm_buf[IMM_INDEX_COUNT + 1];
			//asm("// SYNCHRONIZATION OF INSTRUCTION POINTER AND ROUNDING MODE END");
			ip += num_insts + 1;
		}
	}

	return fprc;
}
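// Editor's note: execute_vm below packs two hashes per workgroup; each hash
// gets IDX_WIDTH lanes (16 when WORKERS_PER_HASH == 16, otherwise 8). Lane
// "sub" within a hash executes every instruction whose slot in the compiled
// program is assigned to it, and lanes synchronize the instruction pointer
// and rounding mode through local memory (imm_buf) after every instruction
// group, as inner_loop above shows.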
#if WORKERS_PER_HASH == 16
__attribute__((reqd_work_group_size(32, 1, 1)))
#else
__attribute__((reqd_work_group_size(16, 1, 1)))
#endif
__kernel void execute_vm(__global void* vm_states, __global void* rounding, __global void* scratchpads, __global const void* dataset_ptr, uint32_t batch_size, uint32_t num_iterations, uint32_t first, uint32_t last)
{
	// 2 hashes per warp, 4 KB shared memory for VM states
	__local uint64_t vm_states_local[(VM_STATE_SIZE * 2) / sizeof(uint64_t)];

	load_buffer(vm_states_local, sizeof(vm_states_local) / sizeof(uint64_t), vm_states);

	barrier(CLK_LOCAL_MEM_FENCE);

	enum { IDX_WIDTH = (WORKERS_PER_HASH == 16) ? 16 : 8 };

	__local uint64_t* R = vm_states_local + (get_local_id(0) / IDX_WIDTH) * VM_STATE_SIZE / sizeof(uint64_t);
	__local double* F = (__local double*)(R + 8);
	__local double* E = (__local double*)(R + 16);

	const uint32_t global_index = get_global_id(0);
	const int32_t idx = global_index / IDX_WIDTH;
	const int32_t sub = global_index % IDX_WIDTH;

	uint32_t ma = ((__local uint32_t*)(R + 16))[0];
	uint32_t mx = ((__local uint32_t*)(R + 16))[1];

	const uint32_t addressRegisters = ((__local uint32_t*)(R + 16))[2];
	__local const uint64_t* readReg0 = (__local uint64_t*)(((__local uint8_t*)R) + (addressRegisters & 0xff));
	__local const uint64_t* readReg1 = (__local uint64_t*)(((__local uint8_t*)R) + ((addressRegisters >> 8) & 0xff));
	__local const uint32_t* readReg2 = (__local uint32_t*)(((__local uint8_t*)R) + ((addressRegisters >> 16) & 0xff));
	__local const uint32_t* readReg3 = (__local uint32_t*)(((__local uint8_t*)R) + (addressRegisters >> 24));

	const uint32_t datasetOffset = ((__local uint32_t*)(R + 16))[3];
	__global const uint8_t* dataset = ((__global const uint8_t*)dataset_ptr) + datasetOffset;

	const uint32_t fp_reg_offset = 64 + ((global_index & 1) << 3);
	const uint32_t fp_reg_group_A_offset = 192 + ((global_index & 1) << 3);

	__local uint64_t* eMask = R + 18;

	const uint32_t program_length = ((__local uint32_t*)(R + 20))[0];
	uint32_t fprc = ((__global uint32_t*)rounding)[idx];

	uint32_t spAddr0 = first ? mx : 0;
	uint32_t spAddr1 = first ? ma : 0;

	__global uint8_t* scratchpad = ((__global uint8_t*)scratchpads) + idx * (uint64_t)(RANDOMX_SCRATCHPAD_L3 + 64);

	const bool f_group = (sub < 4);

	__local double* fe = f_group ? (F + sub * 2) : (E + (sub - 4) * 2);
	__local double* f = F + sub;
	__local double* e = E + sub;

	const uint64_t andMask = f_group ? (uint64_t)(-1) : dynamicMantissaMask;
	const uint64_t orMask1 = f_group ? 0 : eMask[0];
	const uint64_t orMask2 = f_group ? 0 : eMask[1];
	const uint64_t xexponentMask = (sub & 1) ? eMask[1] : eMask[0];

	__local uint32_t* imm_buf = (__local uint32_t*)(R + REGISTERS_SIZE / sizeof(uint64_t));
	__local const uint32_t* compiled_program = (__local const uint32_t*)(R + (REGISTERS_SIZE + IMM_BUF_SIZE) / sizeof(uint64_t));

	const uint32_t workers_mask = ((1 << WORKERS_PER_HASH) - 1) << ((get_local_id(0) / IDX_WIDTH) * IDX_WIDTH);
	const uint32_t fp_workers_mask = 3 << (((sub >> 1) << 1) + (get_local_id(0) / IDX_WIDTH) * IDX_WIDTH);

	#pragma unroll 1
	for (int ic = 0; ic < num_iterations; ++ic)
	{
		__local uint64_t *r;
		__global uint64_t *p0, *p1;
		if ((WORKERS_PER_HASH <= 8) || (sub < 8))
		{
			const uint64_t spMix = *readReg0 ^ *readReg1;
			spAddr0 ^= ((const uint32_t*)&spMix)[0];
			spAddr1 ^= ((const uint32_t*)&spMix)[1];
			spAddr0 &= ScratchpadL3Mask64;
			spAddr1 &= ScratchpadL3Mask64;

			p0 = (__global uint64_t*)(scratchpad + spAddr0 + sub * 8);
			p1 = (__global uint64_t*)(scratchpad + spAddr1 + sub * 8);

			r = R + sub;
			*r ^= *p0;

			uint64_t global_mem_data = *p1;
			int32_t* q = (int32_t*)&global_mem_data;

			fe[0] = load_F_E_groups(q[0], andMask, orMask1);
			fe[1] = load_F_E_groups(q[1], andMask, orMask2);
		}

		//if ((global_index == 0) && (ic == 0))
		//{
		//	printf("ic = %d (before)\n", ic);
		//	for (int i = 0; i < 8; ++i)
		//		printf("f%d = %016llx\n", i, bit_cast(F[i]));
		//	for (int i = 0; i < 8; ++i)
		//		printf("e%d = %016llx\n", i, bit_cast(E[i]));
		//	printf("\n");
		//}

		if ((WORKERS_PER_HASH == IDX_WIDTH) || (sub < WORKERS_PER_HASH))
			fprc = inner_loop(program_length, compiled_program, sub, scratchpad, fp_reg_offset, fp_reg_group_A_offset, R, imm_buf, batch_size, fprc, fp_workers_mask, xexponentMask, workers_mask);

		//if ((global_index == 0) && (ic == RANDOMX_PROGRAM_ITERATIONS - 1))
		//{
		//	printf("ic = %d (after)\n", ic);
		//	for (int i = 0; i < 8; ++i)
		//		printf("r%d = %016llx\n", i, R[i]);
		//	for (int i = 0; i < 8; ++i)
		//		printf("f%d = %016llx\n", i, bit_cast(F[i]));
		//	for (int i = 0; i < 8; ++i)
		//		printf("e%d = %016llx\n", i, bit_cast(E[i]));
		//	printf("\n");
		//}

		if ((WORKERS_PER_HASH <= 8) || (sub < 8))
		{
			mx ^= *readReg2 ^ *readReg3;
			mx &= CacheLineAlignMask;

			const uint64_t next_r = *r ^ *(__global const uint64_t*)(dataset + ma + sub * 8);
			*r = next_r;

			*p1 = next_r;
			*p0 = as_ulong(f[0]) ^ as_ulong(e[0]);

			uint32_t tmp = ma;
			ma = mx;
			mx = tmp;

			spAddr0 = 0;
			spAddr1 = 0;
		}
	}

	//if (global_index == 0)
	//{
	//	for (int i = 0; i < 8; ++i)
	//		printf("r%d = %016llx\n", i, R[i]);
	//	for (int i = 0; i < 8; ++i)
	//		printf("fe%d = %016llx\n", i, bit_cast(F[i]) ^ bit_cast(E[i]));
	//	printf("\n");
	//}

	if ((WORKERS_PER_HASH > 8) && (sub >= 8))
		return;

	__global uint64_t* p = ((__global uint64_t*)vm_states) + idx * (VM_STATE_SIZE / sizeof(uint64_t));
	p[sub] = R[sub];

	if (sub == 0)
	{
		((__global uint32_t*)rounding)[idx] = fprc;
	}

	if (last)
	{
		p[sub + 8] = as_ulong(F[sub]) ^ as_ulong(E[sub]);
		p[sub + 16] = as_ulong(E[sub]);
	}
	else if (sub == 0)
	{
		((__global uint32_t*)(p + 16))[0] = ma;
		((__global uint32_t*)(p + 16))[1] = mx;
	}
}

__attribute__((reqd_work_group_size(64, 1, 1)))
__kernel void find_shares(__global const uint64_t* hashes, uint64_t target, uint32_t start_nonce, __global uint32_t* shares)
{
	const uint32_t global_index = get_global_id(0);

	if (hashes[global_index * 4 + 3] < target)
	{
		//if (global_index == 0)
		{
			const uint32_t idx = atomic_inc(shares + 0xFF);
			if (idx < 0xFF)
			{
				shares[idx] = start_nonce + global_index;
			}
		}
	}
}
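// Editor's note on the share buffer layout implied above: shares[0xFF] is an
// atomic counter of how many results were found, and shares[0..0xFE] hold the
// winning nonces (start_nonce + global_index). The host is expected to read
// the counter, clamp it to 0xFF, then read that many nonce entries; only the
// final 8 bytes of each 32-byte hash (hashes[i * 4 + 3]) are compared against
// the target.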