Workaround multihash/self test of heavy variants when compiled with gcc8
This commit is contained in:
parent 9a28900750
commit 4001331310
4 changed files with 204 additions and 182 deletions
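The change replaces the per-lane scratchpad index locals (idx0, idx1, idx2) in the heavy-variant multihash kernels with a single stack array, sidestepping a miscompilation seen with gcc 8 at high optimization levels, and extends the self-test to the 2- and 3-way heavy/XFH paths. A minimal sketch of the pattern (hypothetical function and parameter names, not the miner's actual code):

    #include <cstddef>
    #include <cstdint>

    // Before: uint64_t idx0 = ..., idx1 = ...;  -- independent locals that
    // gcc 8 could mis-schedule in the unrolled heavy loop.
    // After: one array shared by both lanes, as in the diff below.
    void heavy_loop_sketch(const uint8_t* l0, const uint8_t* l1,
                           uint64_t al0, uint64_t al1,
                           size_t iterations, uint64_t mask)
    {
        uint64_t idx[2];
        idx[0] = al0;
        idx[1] = al1;

        for (size_t i = 0; i < iterations; i++) {
            // each lane addresses its scratchpad through idx[n] & mask
            idx[0] = *reinterpret_cast<const uint64_t*>(&l0[idx[0] & mask]);
            idx[1] = *reinterpret_cast<const uint64_t*>(&l1[idx[1] & mask]);
        }
    }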
@@ -9,10 +9,10 @@ endif()
 if (CMAKE_CXX_COMPILER_ID MATCHES GNU)

     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-strict-aliasing")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast")

-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fno-exceptions -fno-rtti")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fno-exceptions -fno-rtti -Wno-class-memaccess")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -s")

     if (XMRIG_ARMv8)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crypto")
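Two things happen in this hunk: the aggressive unrolling flags (-funroll-loops and friends) are dropped from release builds because they are implicated in the gcc 8 miscompilation this commit works around, and -Wno-class-memaccess silences a warning new in gcc 8 that fires when memset/memcpy touch objects of non-trivial class type, as the scratchpad code does. A hypothetical reproducer for that warning:

    #include <cstring>

    struct ScratchLike {
        ScratchLike() {}          // user-provided ctor makes the type non-trivial
        unsigned char memory[64];
    };

    void clear(ScratchLike& s) {
        // gcc 8+: "clearing an object of non-trivial type" [-Wclass-memaccess],
        // suppressed in this build with -Wno-class-memaccess
        std::memset(&s, 0, sizeof(s));
    }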
@@ -647,6 +647,16 @@ bool CryptoNight::selfTest(int algo)
     cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
     result = result && memcmp(output, test_output_xfh, 32) == 0;

+#if MAX_NUM_HASH_BLOCKS > 1
+    cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_xfh, 64) == 0;
+#endif
+
+#if MAX_NUM_HASH_BLOCKS > 2
+    cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_xfh, 96) == 0;
+#endif
+
     // cnv8 + xtl aka cn-fast2

     cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads);
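Each hash lane writes one 32-byte digest into the shared output buffer, so testing the N-block kernels means comparing the first 32*N bytes against a single reference vector; that is why the memcmp lengths step 32, 64, 96 and why test_output_xfh grows to 96 bytes below. The pattern, reduced to a sketch (hypothetical helper name):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Verify `blocks` consecutive 32-byte digests against a reference vector.
    bool check_blocks(const uint8_t* output, const uint8_t* expected, size_t blocks)
    {
        return std::memcmp(output, expected, 32 * blocks) == 0;
    }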
@@ -117,9 +117,13 @@ const static uint8_t test_output_alloy[32] = {
 };

 // CN XFH
-const static uint8_t test_output_xfh[32] = {
+const static uint8_t test_output_xfh[96] = {
     0x40, 0x86, 0x5A, 0xA8, 0x87, 0x41, 0xEC, 0x1D, 0xCC, 0xBD, 0x2B, 0xC6, 0xFF, 0x36, 0xB9, 0x4D,
-    0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF
+    0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF,
+    0x52, 0x97, 0x35, 0x9E, 0xF7, 0x62, 0x9B, 0x1A, 0x9A, 0x9F, 0xE5, 0x0E, 0x50, 0x5F, 0xE6, 0xFB,
+    0xA4, 0xCF, 0x30, 0x47, 0xF7, 0xFE, 0x68, 0x4F, 0x7C, 0x87, 0x6A, 0xA6, 0x60, 0x3E, 0xD5, 0x3E,
+    0xCC, 0x30, 0xE5, 0xAB, 0xF3, 0x6A, 0x25, 0xB1, 0xD8, 0x89, 0xB8, 0x87, 0xE3, 0x61, 0x1D, 0x1E,
+    0xD5, 0x3D, 0x5D, 0x38, 0x29, 0x0F, 0x88, 0xDA, 0xE1, 0x69, 0x12, 0x4F, 0xA2, 0x4F, 0x24, 0x04
 };

 // CN XTL V9
@@ -181,17 +185,13 @@ const static uint8_t test_output_upx[32] = {
 };

 // CN-Heavy
-const static uint8_t test_output_heavy[160] = {
+const static uint8_t test_output_heavy[96] = {
     0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
     0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2,
     0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A,
     0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D,
     0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7,
-    0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD,
-    0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1,
-    0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9,
-    0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51,
-    0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB
+    0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD
 };

 // CN-Heavy Haven
@@ -206,12 +206,12 @@ const static uint8_t test_output_heavy_haven[96] = {

 // CN-Heavy Tube
 const static uint8_t test_output_heavy_tube[96] = {
-    0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf,
-    0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35,
-    0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3,
-    0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74,
-    0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f,
-    0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb
+    0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF,
+    0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35,
+    0xCD, 0x3F, 0x29, 0xDF, 0x07, 0x4A, 0x14, 0xAD, 0x0B, 0x98, 0x99, 0x37, 0xCA, 0x14, 0x68, 0xA3,
+    0x8D, 0xAE, 0x86, 0xC1, 0xA3, 0x54, 0x05, 0xBE, 0xEA, 0x6D, 0x29, 0x24, 0x0C, 0x82, 0x97, 0x74,
+    0xA0, 0x64, 0x77, 0xCD, 0x8D, 0x8A, 0xC3, 0x10, 0xB4, 0x89, 0x0E, 0xBB, 0x7D, 0xE6, 0x32, 0x8F,
+    0xF4, 0x2D, 0xB6, 0x9E, 0x8A, 0xF9, 0xF8, 0xEE, 0x2C, 0xD0, 0x74, 0xED, 0xA9, 0xAA, 0xA1, 0xFB
 };

 // CN-Ultralite/Turtle
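The reference arrays hold one 32-byte digest per tested block: test_output_heavy apparently shrinks from 160 bytes (five blocks) to 96 (three blocks) to match the self-test above, which exercises at most three lanes, and the tube vector is merely re-cased to the file's uppercase hex style. A sizing sanity check, assuming the 3-block convention:

    #include <cstddef>

    constexpr std::size_t kDigestSize = 32;  // bytes per hash block
    constexpr std::size_t kTestBlocks = 3;   // lanes covered by the self-test
    static_assert(kDigestSize * kTestBlocks == 96, "3-block reference vectors are 96 bytes");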
@@ -2447,12 +2447,13 @@ public:
                                  uint8_t* __restrict__ output,
                                  ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
-        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
+        keccak(static_cast<const uint8_t*>(input) + size, (int) size, scratchPad[1]->state, 200);

         const uint8_t* l0 = scratchPad[0]->memory;
-        const uint8_t* l1 = scratchPad[1]->memory;
         uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+
+        const uint8_t* l1 = scratchPad[1]->memory;
         uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);

         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
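Swapping the C-style casts for static_cast keeps the byte-wise pointer arithmetic on input explicit and lets the compiler reject any conversion that is not actually valid, which matters under the stricter gcc 8 diagnostics. In isolation (hypothetical helper):

    #include <cstddef>
    #include <cstdint>

    // Address of the second input block: convert void* to a byte pointer,
    // then advance by `size` bytes. static_cast is compile-time checked; a
    // C-style cast would silently fall back to reinterpret_cast semantics.
    const uint8_t* second_block(const void* input, std::size_t size)
    {
        return static_cast<const uint8_t*>(input) + size;
    }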
@@ -2460,82 +2461,88 @@ public:

         uint64_t al0 = h0[0] ^ h0[4];
         uint64_t al1 = h1[0] ^ h1[4];

         uint64_t ah0 = h0[1] ^ h0[5];
         uint64_t ah1 = h1[1] ^ h1[5];

         __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);

-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx[2];
+        idx[0] = al0;
+        idx[1] = al1;

         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
             __m128i cx1;

-            if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-            } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-
-                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
-                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+            __m128i ax0 = _mm_set_epi64x(ah0, al0);
+            __m128i ax1 = _mm_set_epi64x(ah1, al1);
+
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t *) &l0[idx[0] & MASK], ax0);
+                cx1 = soft_aesenc((uint32_t *) &l1[idx[1] & MASK], ax1);
+            } else {
+                cx0 = _mm_load_si128((__m128i *) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i *) &l1[idx[1] & MASK]);
+                cx0 = _mm_aesenc_si128(cx0, ax0);
+                cx1 = _mm_aesenc_si128(cx1, ax1);
             }

-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i *) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i *) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));

-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-
-            bx0 = cx0;
-            bx1 = cx1;
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);

             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);

             al0 += hi;
             ah0 += lo;

-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;

             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;

-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = d ^ q;
+
+            bx0 = cx0;

-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);

             al1 += hi;
             ah1 += lo;

-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;

             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;

-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = d ^ q;
+
+            bx1 = cx1;

         }

         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
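Besides the idx array, the loop above hoists the round key into ax0/ax1 once per iteration and defers the bx0/bx1 updates until after each lane's integer stage. Both placements are equivalent: bx only has to hold the previous iteration's cx when the xor-store at the top of the loop executes, and nothing reads it in between. A compilable sketch of that invariant (hypothetical helper, single lane):

    #include <cstddef>
    #include <cstdint>
    #include <emmintrin.h>

    void reorder_sketch(uint8_t* l0, uint64_t* idx, __m128i bx0,
                        size_t iters, uint64_t mask)
    {
        for (size_t i = 0; i < iters; i++) {
            __m128i cx0 = _mm_load_si128(reinterpret_cast<__m128i*>(&l0[idx[0] & mask]));
            _mm_store_si128(reinterpret_cast<__m128i*>(&l0[idx[0] & mask]),
                            _mm_xor_si128(bx0, cx0));               // consumes the old bx0
            idx[0] ^= static_cast<uint64_t>(_mm_cvtsi128_si64(cx0)); // stand-in for the integer stage
            bx0 = cx0;  // deferred update; no intervening read of bx0
        }
    }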
@@ -2572,76 +2579,78 @@ public:
         __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);

-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx[2];
+
+        idx[0] = al0;
+        idx[1] = al1;

         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
             __m128i cx1;

             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);

                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }

-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));

-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);

             bx0 = cx0;
             bx1 = cx1;

             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);

             al0 += hi;
             ah0 += lo;

-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;

             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;

-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = (~d) ^ q;


-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);

             al1 += hi;
             ah1 += lo;

-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;

             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;

-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = (~d) ^ q;
         }

         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -3464,9 +3473,10 @@ public:
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
         __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);

-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
-        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx[3];
+        idx[0] = al0;
+        idx[1] = al1;
+        idx[2] = al2;

         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
@@ -3474,26 +3484,26 @@ public:
             __m128i cx2;

             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx[2] & MASK], _mm_set_epi64x(ah2, al2));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx[2] & MASK]);

                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
                 cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }

-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx[2] & MASK], _mm_xor_si128(bx2, cx2));

-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-            idx2 = EXTRACT64(cx2);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
+            idx[2] = EXTRACT64(cx2);

             bx0 = cx0;
             bx1 = cx1;
@@ -3501,70 +3511,70 @@ public:

             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);

             al0 += hi;
             ah0 += lo;

-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;

             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;

-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = d ^ q;


-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);

             al1 += hi;
             ah1 += lo;

-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;

             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;

-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = d ^ q;


-            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-            lo = __umul128(idx2, cl, &hi);
+            cl = ((uint64_t*) &l2[idx[2] & MASK])[0];
+            ch = ((uint64_t*) &l2[idx[2] & MASK])[1];
+            lo = __umul128(idx[2], cl, &hi);

             al2 += hi;
             ah2 += lo;

-            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ((uint64_t*) &l2[idx[2] & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx[2] & MASK])[1] = ah2;

             ah2 ^= ch;
             al2 ^= cl;
-            idx2 = al2;
+            idx[2] = al2;

-            n = ((int64_t*)&l2[idx2 & MASK])[0];
-            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            n = ((int64_t*)&l2[idx[2] & MASK])[0];
+            d = ((int32_t*)&l2[idx[2] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = d ^ q;
+            ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q;
+            idx[2] = d ^ q;
         }

         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -3611,9 +3621,11 @@ public:
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
         __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);

-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
-        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx[3];
+
+        idx[0] = al0;
+        idx[1] = al1;
+        idx[2] = al2;

         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
@@ -3621,26 +3633,26 @@ public:
             __m128i cx2;

             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx[2] & MASK], _mm_set_epi64x(ah2, al2));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx[2] & MASK]);

                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
                 cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }

-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx[2] & MASK], _mm_xor_si128(bx2, cx2));

-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-            idx2 = EXTRACT64(cx2);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
+            idx[2] = EXTRACT64(cx2);

             bx0 = cx0;
             bx1 = cx1;
@@ -3648,70 +3660,70 @@ public:

             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);

             al0 += hi;
             ah0 += lo;

-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;

             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;

-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = (~d) ^ q;


-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);

             al1 += hi;
             ah1 += lo;

-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;

             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;

-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = (~d) ^ q;


-            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-            lo = __umul128(idx2, cl, &hi);
+            cl = ((uint64_t*) &l2[idx[2] & MASK])[0];
+            ch = ((uint64_t*) &l2[idx[2] & MASK])[1];
+            lo = __umul128(idx[2], cl, &hi);

             al2 += hi;
             ah2 += lo;

-            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ((uint64_t*) &l2[idx[2] & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx[2] & MASK])[1] = ah2;

             ah2 ^= ch;
             al2 ^= cl;
-            idx2 = al2;
+            idx[2] = al2;

-            n = ((int64_t*)&l2[idx2 & MASK])[0];
-            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            n = ((int64_t*)&l2[idx[2] & MASK])[0];
+            d = ((int32_t*)&l2[idx[2] & MASK])[2];
             q = n / (d | 0x5);

-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = (~d) ^ q;
+            ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q;
+            idx[2] = (~d) ^ q;
         }

         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);