Integrated cn-conceal/ccx for x86 arch

This commit is contained in:
Ben Gräf 2019-07-21 11:50:29 +02:00
parent 836e8b24f3
commit 8ae1065085
5 changed files with 251 additions and 17 deletions

View file

@ -354,6 +354,7 @@ constexpr static const char *pow_variant_names[] = {
"zls",
"graft",
"upx2",
"conceal",
"chukwa",
"wrkz"
};
@ -1311,6 +1312,11 @@ bool Options::parsePowVariant(const char *powVariant)
break;
}
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "conceal") || !strcmp(powVariant, "ccx"))) {
m_powVariant = POW_CONCEAL;
break;
}
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "chukwa"))) {
m_powVariant = POW_ARGON2_CHUKWA;
break;

View file

@ -45,6 +45,7 @@ enum PowVariant
POW_ZELERIUS,
POW_RWZ,
POW_UPX2,
POW_CONCEAL,
POW_ARGON2_CHUKWA,
POW_ARGON2_WRKZ,
LAST_ITEM
@ -94,6 +95,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
return "rwz";
case POW_UPX2:
return "upx2";
case POW_CONCEAL:
return "conceal";
case POW_ARGON2_CHUKWA:
return "chukwa";
case POW_ARGON2_WRKZ:
@ -185,6 +188,8 @@ inline PowVariant parseVariant(const std::string variant)
powVariant = PowVariant::POW_RWZ;
} else if (variant == "upx2") {
powVariant = PowVariant::POW_UPX2;
} else if (variant == "conceal" || variant == "ccx") {
powVariant = PowVariant::POW_CONCEAL;
} else if (variant == "chukwa" || variant == "trtl-chukwa" || variant == "argon2-chukwa") {
powVariant = PowVariant::POW_ARGON2_CHUKWA;
} else if (variant == "chukwa_wrkz" || variant == "wrkz" || variant == "argon2-wrkz") {

View file

@ -69,6 +69,21 @@ const static uint8_t test_output_v0[160] = {
};
// CN CONCEAL
// Reference digest for the cn-conceal (ccx) self-test: expected output for the
// fixed 76-byte test input, checked in 32-byte steps (one step per hash block,
// up to five blocks) by selfCheck() via memcmp.
const static uint8_t test_output_conceal[160] = {
0xB3, 0xA1, 0x67, 0x86, 0xD2, 0xC9, 0x85, 0xEC, 0xAD, 0xC4, 0x5F, 0x91, 0x05, 0x27, 0xC7, 0xA1,
0x96, 0xF0, 0xE1, 0xE9, 0x7C, 0x87, 0x09, 0x38, 0x1D, 0x7D, 0x41, 0x93, 0x35, 0xF8, 0x16, 0x72,
0xC3, 0xBD, 0x8D, 0xE8, 0xD5, 0xAE, 0xB8, 0x59, 0x0A, 0x6C, 0xCB, 0x7B, 0x41, 0x30, 0xF7, 0x04,
0xA5, 0x7C, 0xF9, 0xCA, 0x20, 0x49, 0x9C, 0xFD, 0xE8, 0x43, 0xCF, 0x66, 0x78, 0xEA, 0x76, 0xDD,
0x91, 0x0C, 0xDE, 0x29, 0x2A, 0xE0, 0xA8, 0xCA, 0xBC, 0xAA, 0x53, 0x4C, 0x93, 0x3E, 0x7B, 0x2C,
0xF1, 0xF9, 0xE1, 0x98, 0xB2, 0x92, 0x1E, 0x19, 0x93, 0x2A, 0x74, 0x9D, 0xDB, 0x10, 0x0F, 0x16,
0xD5, 0x3D, 0xE4, 0xC4, 0x23, 0xD9, 0x2E, 0xFD, 0x79, 0x8D, 0x1E, 0x48, 0x4E, 0x46, 0x08, 0x6C,
0xFF, 0x8A, 0x49, 0xFA, 0x1E, 0xB0, 0xB6, 0x9A, 0x47, 0x1C, 0xC6, 0x30, 0x36, 0x5D, 0xFD, 0x76,
0x10, 0x07, 0x44, 0xE6, 0xC8, 0x20, 0x2A, 0x84, 0x9D, 0x70, 0x22, 0x00, 0x8B, 0x9B, 0xBD, 0x8D,
0x27, 0x49, 0xA6, 0x06, 0xDC, 0xF0, 0xA1, 0x4B, 0x50, 0xA0, 0x12, 0xCD, 0x77, 0x01, 0x4C, 0x28
};
// CN v7
const static uint8_t test_output_v1[160] = {
0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9,

View file

@ -191,11 +191,14 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
// Select the floating-point rounding mode: MSVC via _control87, everything
// else via the standard <cfenv> interface. Each macro deliberately ends in
// ';' because call sites invoke them without a trailing semicolon.
#ifdef _MSC_VER
# define SET_ROUNDING_MODE_UP() _control87(RC_UP, MCW_RC);
# define SET_ROUNDING_MODE_DOWN() _control87(RC_DOWN, MCW_RC);
# define SET_ROUNDING_MODE_NEAREST() _control87(RC_NEAR, MCW_RC);
#else
# define SET_ROUNDING_MODE_UP() std::fesetround(FE_UPWARD);
# define SET_ROUNDING_MODE_DOWN() std::fesetround(FE_DOWNWARD);
# define SET_ROUNDING_MODE_NEAREST() std::fesetround(FE_TONEAREST);
#endif
# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax, reverse) \
{ \
const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ (reverse ? 0x30 : 0x10)))); \
@ -663,7 +666,27 @@ int_sqrt_v2_fixup(r, n0);
return r;
}
// n-Loop version. Seems to be a little bit slower than the hardcoded one.
// Bit-cast helper: broadcast a raw 32-bit pattern into all four float lanes
// without any int-to-float value conversion.
inline __m128 _mm_set1_ps_epi32(uint32_t x)
{
    // Explicit cast: _mm_set1_epi32 takes int, and patterns such as
    // 0x807FFFFF would otherwise rely on an implicit uint32_t -> int
    // conversion.
    return _mm_castsi128_ps(_mm_set1_epi32(static_cast<int>(x)));
}

// Per-iteration tweak of the cn-conceal (ccx) variant: mixes the AES input
// state `cx` with `conc_var`, a float accumulator carried across iterations
// (initialised to zero by the callers before the main loop).
inline void cryptonight_conceal_tweak(__m128i& cx, __m128& conc_var)
{
    __m128 r = _mm_cvtepi32_ps(cx);
    __m128 c_old = conc_var;
    r = _mm_add_ps(r, conc_var);
    // r = r^3, then clear the exponent field (keep sign + mantissa) and set
    // it to 0x40 so each lane's magnitude lands in [2, 4).
    r = _mm_mul_ps(r, _mm_mul_ps(r, r));
    r = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), r);
    r = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), r);
    conc_var = _mm_add_ps(conc_var, r);

    // Normalise the previous accumulator the same way, scale it up, and fold
    // the truncated integer result back into the AES state.
    c_old = _mm_and_ps(_mm_set1_ps_epi32(0x807FFFFF), c_old);
    c_old = _mm_or_ps(_mm_set1_ps_epi32(0x40000000), c_old);
    __m128 nc = _mm_mul_ps(c_old, _mm_set1_ps(536870880.0f));
    cx = _mm_xor_si128(cx, _mm_cvttps_epi32(nc));
}
template<size_t ITERATIONS, size_t INDEX_SHIFT, size_t MEM, size_t MASK, bool SOFT_AES, PowVariant VARIANT, size_t NUM_HASH_BLOCKS>
class CryptoNightMultiHash
{
@ -768,8 +791,14 @@ public:
uint64_t* h;
uint64_t al;
uint64_t ah;
__m128i bx;
uint64_t idx;
__m128i bx;
__m128 conc_var;
if (VARIANT == POW_CONCEAL) {
SET_ROUNDING_MODE_NEAREST()
conc_var = _mm_setzero_ps();
}
keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
@ -787,9 +816,19 @@ public:
__m128i cx;
if (SOFT_AES) {
if (VARIANT == POW_CONCEAL) {
cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
cryptonight_conceal_tweak(cx, conc_var);
cx = soft_aesenc(cx, _mm_set_epi64x(ah, al));
} else {
cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
}
} else {
cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
if (VARIANT == POW_CONCEAL)
cryptonight_conceal_tweak(cx, conc_var);
cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
}
@ -1552,6 +1591,15 @@ public:
__m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128 conc_var0;
__m128 conc_var1;
if (VARIANT == POW_CONCEAL) {
SET_ROUNDING_MODE_NEAREST()
conc_var0 = _mm_setzero_ps();
conc_var1 = _mm_setzero_ps();
}
uint64_t idx0 = h0[0] ^h0[4];
uint64_t idx1 = h1[0] ^h1[4];
@ -1560,12 +1608,28 @@ public:
__m128i cx1;
if (SOFT_AES) {
if (VARIANT == POW_CONCEAL) {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
} else {
cx0 = soft_aesenc((uint32_t *) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc((uint32_t *) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
}
} else {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
if (VARIANT == POW_CONCEAL) {
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
}
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
}
@ -2572,6 +2636,17 @@ public:
__m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
__m128 conc_var0;
__m128 conc_var1;
__m128 conc_var2;
if (VARIANT == POW_CONCEAL) {
SET_ROUNDING_MODE_NEAREST()
conc_var0 = _mm_setzero_ps();
conc_var1 = _mm_setzero_ps();
conc_var2 = _mm_setzero_ps();
}
uint64_t idx0 = h0[0] ^h0[4];
uint64_t idx1 = h1[0] ^h1[4];
uint64_t idx2 = h2[0] ^h2[4];
@ -2582,14 +2657,34 @@ public:
__m128i cx2;
if (SOFT_AES) {
if (VARIANT == POW_CONCEAL) {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc(cx2, _mm_set_epi64x(ah2, al2));
} else {
cx0 = soft_aesenc((uint32_t *) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc((uint32_t *) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc((uint32_t *) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
}
} else {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
if (VARIANT == POW_CONCEAL) {
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
}
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
@ -3771,6 +3866,19 @@ public:
__m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
__m128 conc_var0;
__m128 conc_var1;
__m128 conc_var2;
__m128 conc_var3;
if (VARIANT == POW_CONCEAL) {
SET_ROUNDING_MODE_NEAREST()
conc_var0 = _mm_setzero_ps();
conc_var1 = _mm_setzero_ps();
conc_var2 = _mm_setzero_ps();
conc_var3 = _mm_setzero_ps();
}
uint64_t idx0 = h0[0] ^h0[4];
uint64_t idx1 = h1[0] ^h1[4];
uint64_t idx2 = h2[0] ^h2[4];
@ -3783,16 +3891,40 @@ public:
__m128i cx3;
if (SOFT_AES) {
if (VARIANT == POW_CONCEAL) {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
cryptonight_conceal_tweak(cx3, conc_var3);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc(cx2, _mm_set_epi64x(ah2, al2));
cx3 = soft_aesenc(cx3, _mm_set_epi64x(ah3, al3));
} else {
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
}
} else {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
if (VARIANT == POW_CONCEAL) {
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
cryptonight_conceal_tweak(cx3, conc_var3);
}
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
@ -4821,6 +4953,21 @@ public:
__m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
__m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
__m128 conc_var0;
__m128 conc_var1;
__m128 conc_var2;
__m128 conc_var3;
__m128 conc_var4;
if (VARIANT == POW_CONCEAL) {
SET_ROUNDING_MODE_NEAREST()
conc_var0 = _mm_setzero_ps();
conc_var1 = _mm_setzero_ps();
conc_var2 = _mm_setzero_ps();
conc_var3 = _mm_setzero_ps();
conc_var4 = _mm_setzero_ps();
}
uint64_t idx0 = h0[0] ^h0[4];
uint64_t idx1 = h1[0] ^h1[4];
uint64_t idx2 = h2[0] ^h2[4];
@ -4835,11 +4982,31 @@ public:
__m128i cx4;
if (SOFT_AES) {
if (VARIANT == POW_CONCEAL) {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
cryptonight_conceal_tweak(cx3, conc_var3);
cryptonight_conceal_tweak(cx4, conc_var4);
cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc(cx2, _mm_set_epi64x(ah2, al2));
cx3 = soft_aesenc(cx3, _mm_set_epi64x(ah3, al3));
cx4 = soft_aesenc(cx4, _mm_set_epi64x(ah4, al4));
} else {
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
}
} else {
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
@ -4847,6 +5014,14 @@ public:
cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
if (VARIANT == POW_CONCEAL) {
cryptonight_conceal_tweak(cx0, conc_var0);
cryptonight_conceal_tweak(cx1, conc_var1);
cryptonight_conceal_tweak(cx2, conc_var2);
cryptonight_conceal_tweak(cx3, conc_var3);
cryptonight_conceal_tweak(cx4, conc_var4);
}
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));

View file

@ -173,6 +173,8 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, uint64_t height,
#endif
} else if (variant == PowVariant::POW_XFH) {
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_XFH, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
} else if (variant == PowVariant::POW_CONCEAL) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_CONCEAL, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
} else {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
}
@ -297,6 +299,8 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, uint64_t height
#endif
} else if (variant == PowVariant::POW_XFH) {
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_XFH, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
} else if (variant == PowVariant::POW_CONCEAL) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_CONCEAL, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
} else {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
}
@ -845,6 +849,35 @@ bool HashSelector::selfCheck(Options::Algo algo)
#if MAX_NUM_HASH_BLOCKS > 4
hash_ctx[4](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_v2, 160) == 0;
#endif
// cn conceal
#if !defined(XMRIG_ARM)
hash_ctx[0](asmOptimization, 0, PowVariant::POW_CONCEAL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_conceal, 32) == 0;
#if MAX_NUM_HASH_BLOCKS > 1
hash_ctx[1](asmOptimization, 0, PowVariant::POW_CONCEAL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_conceal, 64) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 2
hash_ctx[2](asmOptimization, 0, PowVariant::POW_CONCEAL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_conceal, 96) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 3
hash_ctx[3](asmOptimization, 0, PowVariant::POW_CONCEAL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_conceal, 128) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 4
hash_ctx[4](asmOptimization, 0, PowVariant::POW_CONCEAL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_conceal, 160) == 0;
#endif
#endif
// cn xfh aka cn-heavy-superfast