Add ASM tweak for XTL and fix for 32-bit
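
XTL now goes through the single-hash Intel ASM main loop when ASM_INTEL is selected and NUM_HASH_BLOCKS == 1 (ARM builds keep the C++ path). A second tweak table, variant_xtl_table, covers the larger XTL index shift, and the ScratchPad field is renamed to variant_table so it can point at either table. 32-bit builds fall back to INTEGER_MATH_V2 instead of the packed 64-bit SSE division/sqrt state used on x86_64.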

BenDr0id 2018-10-16 08:47:09 +02:00
parent 857e3193dd
commit 6e6dfd93fc
3 changed files with 47 additions and 7 deletions

File 1 of 3

@@ -59,7 +59,15 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
     } else if (powVersion == PowVariant::POW_ALLOY) {
         CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_XTL) {
+#if defined(XMRIG_ARM)
         CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
     } else if (powVersion == PowVariant::POW_MSR) {
         CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_RTO) {
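
The soft-AES dispatcher below repeats the same guard. Condensed, the rule both hunks implement looks roughly like this (hypothetical helper name, not code from this commit):

// The hand-written Sandy Bridge loop only covers the single-hash x86 case,
// so everything else stays on the templated C++ hashPowV2 path.
template <size_t NUM_HASH_BLOCKS>
static inline bool useXtlAsmPath(AsmOptimization asmOptimization)
{
#if defined(XMRIG_ARM)
    (void) asmOptimization;
    return false;                      // no x86 asm on ARM builds
#else
    return asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1;
#endif
}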
@@ -95,7 +103,15 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
     } else if (powVersion == PowVariant::POW_ALLOY) {
         CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_XTL) {
+#if defined(XMRIG_ARM)
         CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
     } else if (powVersion == PowVariant::POW_MSR) {
         CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
    } else if (powVersion == PowVariant::POW_RTO) {
@@ -201,8 +217,11 @@ bool CryptoNight::init(int algo, bool aesni)
 {
     for (int i = 0; i < 256; ++i)
     {
-        const uint64_t index = (((i >> 3) & 6) | (i & 1)) << 1;
+        const uint64_t index = (((i >> POW_DEFAULT_INDEX_SHIFT) & 6) | (i & 1)) << 1;
+        const uint64_t index_xtl = (((i >> POW_XLT_V4_INDEX_SHIFT) & 6) | (i & 1)) << 1;
         variant1_table[i] = i ^ ((0x75310 >> index) & 0x30);
+        variant_xtl_table[i] = i ^ ((0x75310 >> index_xtl) & 0x30);
     }
     setCryptoNightHashMethods<MAX_NUM_HASH_BLOCKS>(static_cast<Options::Algo>(algo), aesni);
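
Both tables are just the CNv1 tweak precomputed for every byte value; the XTL table only differs in the index shift. A sketch of the equivalence, assuming POW_DEFAULT_INDEX_SHIFT is 3 (as the replaced literal suggests) and POW_XLT_V4_INDEX_SHIFT is 4:

// The asm main loop applies the CNv1 tweak to byte 11 of each written cache
// line with a single table lookup instead of redoing this bit-fiddling:
static inline uint8_t cnv1_tweak(uint8_t b, unsigned shift)
{
    const uint64_t index = (((b >> shift) & 6) | (b & 1)) << 1;
    return static_cast<uint8_t>(b ^ ((0x75310 >> index) & 0x30));
}

// variant1_table[i]    == cnv1_tweak(i, POW_DEFAULT_INDEX_SHIFT)
// variant_xtl_table[i] == cnv1_tweak(i, POW_XLT_V4_INDEX_SHIFT)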

File 2 of 3

@@ -45,11 +45,12 @@ struct ScratchPad {
     // Additional stuff for asm impl
     uint8_t ctx_info[24];
     const void* input;
-    uint8_t* variant1_table;
+    uint8_t* variant_table;
     const uint32_t* t_fn;
 };

 alignas(64) static uint8_t variant1_table[256];
+alignas(64) static uint8_t variant_xtl_table[256];

 class Job;
 class JobResult;
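
variant_table replaces variant1_table because a single field now has to point at whichever tweak table the variant needs. The selection is keyed on the index shift, as in the hashPowV2_asm change below; a sketch using the names from this diff:

static inline uint8_t* selectVariantTable(size_t indexShift)
{
    // default CNv1 shift -> variant1_table, XTL shift -> variant_xtl_table
    return indexShift == POW_DEFAULT_INDEX_SHIFT ? variant1_table
                                                 : variant_xtl_table;
}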

File 3 of 3

@@ -1409,14 +1409,18 @@ public:
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);

 #ifndef XMRIG_NO_ASM
-        if (SOFT_AES) {
+        if (INDEX_SHIFT == POW_DEFAULT_INDEX_SHIFT) {
+            scratchPad[0]->variant_table = variant1_table;
+        } else {
+            scratchPad[0]->variant_table = variant_xtl_table;
+        }
+
         scratchPad[0]->input = input;
-        scratchPad[0]->variant1_table = variant1_table;
+
+        if (SOFT_AES) {
             scratchPad[0]->t_fn = (const uint32_t*)saes_table;
             cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
         } else {
-            scratchPad[0]->input = input;
-            scratchPad[0]->variant1_table = variant1_table;
             cnv1_mainloop_sandybridge_asm(scratchPad[0]);
         }
 #endif
@@ -2071,8 +2075,15 @@
         uint64_t idx0 = h0[0] ^ h0[4];
         uint64_t idx1 = h1[0] ^ h1[4];

+#if defined(__x86_64__) || defined(_M_AMD64)
         __m128i division_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[12]), _mm_cvtsi64_si128(h1[12]));
         __m128i sqrt_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[13]), _mm_cvtsi64_si128(h1[13]));
+#else
+        __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]);
+        __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]);
+        uint64_t sqrt_result0 = h0[13];
+        uint64_t sqrt_result1 = h1[13];
+#endif

         SET_ROUNDING_MODE_UP()
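
On x86_64 the two hashes' division and square-root state is packed into one xmm each (lane 0 for hash 0, lane 1 for hash 1), while the new 32-bit branch keeps separate per-hash variables and lets INTEGER_MATH_V2 maintain them. A sketch of how a 64-bit lane comes back out of the packed state (hypothetical helper, not part of the commit):

#include <emmintrin.h>   // SSE2 intrinsics

// Pull one hash's 64-bit value out of the packed state; the x86_64 code in
// the following hunks does the same thing inline with _mm_srli_si128 +
// _mm_cvtsi128_si64.
static inline uint64_t lane64(__m128i packed, int hash_index)
{
    if (hash_index != 0) {
        packed = _mm_srli_si128(packed, 8);                          // move the high lane down
    }
    uint64_t out;
    _mm_storel_epi64(reinterpret_cast<__m128i*>(&out), packed);     // store the low 8 bytes
    return out;
}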
@@ -2107,8 +2118,12 @@
         cl = ((uint64_t*) &l0[idx0 & MASK])[0];
         ch = ((uint64_t*) &l0[idx0 & MASK])[1];

+#if defined(__x86_64__) || defined(_M_AMD64)
         const uint64_t sqrt_result0 = _mm_cvtsi128_si64(sqrt_result_xmm);
         cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result0 << 32);
+#else
+        INTEGER_MATH_V2(0, cl, cx0)
+#endif

         lo = __umul128(idx0, cl, &hi);
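
Whatever INTEGER_MATH_V2(N, cl, cxN) expands to, it has to perform the same per-iteration CNv2 integer math that the packed x86_64 code does. A scalar sketch of that math, assuming int_sqrt_v2 names the codebase's v2 square-root helper (placeholder function and parameter names, not the actual macro body):

// cx_lo / cx_hi are the low and high 64-bit lanes of cxN; division_result
// and sqrt_result are the per-hash state kept above.
static inline void cnv2_integer_math_sketch(uint64_t& cl,
                                            uint64_t cx_lo, uint64_t cx_hi,
                                            uint64_t& division_result,
                                            uint64_t& sqrt_result)
{
    cl ^= division_result ^ (sqrt_result << 32);

    const uint32_t d = static_cast<uint32_t>(cx_lo + (sqrt_result << 1)) | 0x80000001u;
    division_result  = static_cast<uint32_t>(cx_hi / d) + ((cx_hi % d) << 32);
    sqrt_result      = int_sqrt_v2(cx_lo + division_result);   // v2 integer sqrt
}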
@@ -2127,9 +2142,11 @@
         bx10 = bx00;
         bx00 = cx0;

         cl = ((uint64_t*) &l1[idx1 & MASK])[0];
         ch = ((uint64_t*) &l1[idx1 & MASK])[1];

+#if defined(__x86_64__) || defined(_M_AMD64)
         const uint64_t sqrt_result1 = _mm_cvtsi128_si64(_mm_srli_si128(sqrt_result_xmm, 8));
         cl ^= static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_srli_si128(division_result_xmm, 8))) ^ (sqrt_result1 << 32);
@@ -2173,6 +2190,9 @@
         r1 = static_cast<uint64_t>(_mm_cvtsi128_si64(_mm_srli_si128(_mm_castpd_si128(x), 8)));
         int_sqrt_v2_fixup(r1, _mm_cvtsi128_si64(_mm_srli_si128(sqrt_input, 8)));
         sqrt_result_xmm = _mm_set_epi64x(r1, r0);
+#else
+        INTEGER_MATH_V2(1, cl, cx1)
+#endif

         lo = __umul128(idx1, cl, &hi);