Support for Cryptonight variant 4 (Monero)

This commit is contained in:
SChernykh 2019-02-17 18:17:14 +01:00
parent 2df204f8a8
commit 764767d317
28 changed files with 2610 additions and 253 deletions

View file

@ -12,8 +12,11 @@ enum V4_Settings
TOTAL_LATENCY = 15 * 3,
// Always generate at least 60 instructions
NUM_INSTRUCTIONS = 60,
NUM_INSTRUCTIONS_MIN = 60,
// Never generate more than 70 instructions (final RET instruction doesn't count here)
NUM_INSTRUCTIONS_MAX = 70,
// Available ALUs for MUL
// Modern CPUs typically have only 1 ALU which can do multiplications
ALU_COUNT_MUL = 1,
@ -38,10 +41,9 @@ enum V4_InstructionList
// V4_InstructionDefinition is used to generate code from random data
// Every random sequence of bytes is a valid code
//
// There are 8 registers in total:
// There are 9 registers in total:
// - 4 variable registers
// - 4 constant registers initialized from loop variables
//
// - 5 constant registers initialized from loop variables
// This is why dst_index is 2 bits
enum V4_InstructionDefinition
{
@ -144,16 +146,16 @@ static void v4_random_math(const struct V4_Instruction* code, v4_reg* r)
// Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency
// I've checked all block heights < 10,000,000 and here is the distribution of program sizes:
//
// 60 28495
// 61 106077
// 62 2455855
// 63 5114930
// 64 1020868
// 65 1109026
// 66 151756
// 67 8429
// 68 4477
// 69 87
// 60 27960
// 61 105054
// 62 2452759
// 63 5115997
// 64 1022269
// 65 1109635
// 66 153145
// 67 8550
// 68 4529
// 69 102
// Unroll 70 instructions here
V4_EXEC_10(0); // instructions 0-9
@ -179,6 +181,8 @@ static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed
}
// Generates as many random math operations as possible with given latency and ALU restrictions
// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
template<xmrig::Variant VARIANT>
static int v4_random_math_init(struct V4_Instruction* code, const uint64_t height)
{
// MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle
@ -200,6 +204,10 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
memset(data, 0, sizeof(data));
uint64_t tmp = SWAP64LE(height);
memcpy(data, &tmp, sizeof(uint64_t));
if (VARIANT == xmrig::VARIANT_4)
{
data[20] = -38;
}
// Set data_index past the last byte in data
// to trigger full data update with blake hash
@ -207,18 +215,22 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
size_t data_index = sizeof(data);
int code_size;
// There is a small chance (1.8%) that register R8 won't be used in the generated program
// So we keep track of it and try again if it's not used
bool r8_used;
do {
int latency[8];
int asic_latency[8];
int latency[9];
int asic_latency[9];
// Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution
// byte 0: current value of the destination register
// byte 1: instruction opcode
// byte 2: current value of the source register
//
// Registers R4-R7 are constant and are treated as having the same value because when we do
// Registers R4-R8 are constant and are treated as having the same value because when we do
// the same operation twice with two constant source registers, it can be optimized into a single operation
uint32_t inst_data[8] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF };
bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT];
bool is_rotation[V4_INSTRUCTION_COUNT];
@ -237,6 +249,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
code_size = 0;
int total_iterations = 0;
r8_used = (VARIANT == xmrig::VARIANT_WOW);
// Generate random code to achieve minimal required latency for our abstract CPU
// Try to get this latency for all 4 registers
@ -281,7 +294,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b))
{
// a is always < 4, so we don't need to check bounds here
b = a + 4;
b = (VARIANT == xmrig::VARIANT_WOW) ? (a + 4) : 8;
src_index = b;
}
@ -362,6 +375,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
code[code_size].src_index = src_index;
code[code_size].C = 0;
if (src_index == 8)
{
r8_used = true;
}
if (opcode == ADD)
{
// ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too
@ -376,7 +394,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
}
++code_size;
if (code_size >= NUM_INSTRUCTIONS)
if (code_size >= NUM_INSTRUCTIONS_MIN)
{
break;
}
@ -391,7 +409,7 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
// We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC
// Get this latency for at least 1 of the 4 registers
const int prev_code_size = code_size;
while ((asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY))
{
int min_idx = 0;
int max_idx = 0;
@ -413,9 +431,11 @@ static int v4_random_math_init(struct V4_Instruction* code, const uint64_t heigh
++code_size;
}
// There is ~99.8% chance that code_size >= NUM_INSTRUCTIONS here, so second iteration is required rarely
} while (code_size < NUM_INSTRUCTIONS);
// There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time
// It never does more than 4 iterations for all block heights < 10,000,000
} while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX));
// It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here
// Add final instruction to stop the interpreter
code[code_size].opcode = RET;
code[code_size].dst_index = 0;