From 4001331310aeef21bdf6a6a0bdef91854e7f6c17 Mon Sep 17 00:00:00 2001
From: BenDr0id
Date: Tue, 29 Jan 2019 16:49:08 +0100
Subject: [PATCH] Work around multihash self test of heavy variants when
 compiled with gcc8

---
 cmake/flags.cmake             |   6 +-
 src/crypto/CryptoNight.cpp    |  10 +
 src/crypto/CryptoNight_test.h |  30 +--
 src/crypto/CryptoNight_x86.h  | 340 ++++++++++++++++++----------------
 4 files changed, 204 insertions(+), 182 deletions(-)

diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 488f1236..07dd1b98 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,10 +9,10 @@ endif()
 
 if (CMAKE_CXX_COMPILER_ID MATCHES GNU)
     set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wall -Wno-strict-aliasing")
-    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+    set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast")
 
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fno-exceptions -fno-rtti")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -fno-exceptions -fno-rtti -Wno-class-memaccess")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Ofast -s")
 
     if (XMRIG_ARMv8)
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crypto")
diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp
index 03a43564..6c05e9a4 100644
--- a/src/crypto/CryptoNight.cpp
+++ b/src/crypto/CryptoNight.cpp
@@ -647,6 +647,16 @@ bool CryptoNight::selfTest(int algo)
     cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
     result = result && memcmp(output, test_output_xfh, 32) == 0;
 
+    #if MAX_NUM_HASH_BLOCKS > 1
+    cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_xfh, 64) == 0;
+    #endif
+
+    #if MAX_NUM_HASH_BLOCKS > 2
+    cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_xfh, 96) == 0;
+    #endif
+
     // cnv8 + xtl aka cn-fast2
     cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads);
 
diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h
index 64f125bd..9ce43073 100644
--- a/src/crypto/CryptoNight_test.h
+++ b/src/crypto/CryptoNight_test.h
@@ -117,9 +117,13 @@ const static uint8_t test_output_alloy[32] = {
 };
 
 // CN XFH
-const static uint8_t test_output_xfh[32] = {
-    0x40, 0x86, 0x5A, 0xA8, 0x87, 0x41, 0xEC, 0x1D, 0xCC, 0xBD, 0x2B, 0xC6, 0xFF, 0x36, 0xB9, 0x4D,
-    0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF
+const static uint8_t test_output_xfh[96] = {
+    0x40, 0x86, 0x5A, 0xA8, 0x87, 0x41, 0xEC, 0x1D, 0xCC, 0xBD, 0x2B, 0xC6, 0xFF, 0x36, 0xB9, 0x4D,
+    0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF,
+    0x52, 0x97, 0x35, 0x9E, 0xF7, 0x62, 0x9B, 0x1A, 0x9A, 0x9F, 0xE5, 0x0E, 0x50, 0x5F, 0xE6, 0xFB,
+    0xA4, 0xCF, 0x30, 0x47, 0xF7, 0xFE, 0x68, 0x4F, 0x7C, 0x87, 0x6A, 0xA6, 0x60, 0x3E, 0xD5, 0x3E,
+    0xCC, 0x30, 0xE5, 0xAB, 0xF3, 0x6A, 0x25, 0xB1, 0xD8, 0x89, 0xB8, 0x87, 0xE3, 0x61, 0x1D, 0x1E,
+    0xD5, 0x3D, 0x5D, 0x38, 0x29, 0x0F, 0x88, 0xDA, 0xE1, 0x69, 0x12, 0x4F, 0xA2, 0x4F, 0x24, 0x04
 };
 
 // CN XTL V9
@@ -181,17 +185,13 @@ const static uint8_t test_output_upx[32] = {
 };
 
 // CN-Heavy
-const static uint8_t test_output_heavy[160] = {
+const static uint8_t test_output_heavy[96] = {
     0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
     0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2,
     0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A,
     0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D,
     0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7,
-    0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD,
-    0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1,
-    0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9,
-    0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51,
-    0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB
+    0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD
 };
 
 // CN-Heavy Haven
@@ -206,12 +206,12 @@ const static uint8_t test_output_heavy_haven[96] = {
 
 // CN-Heavy Tube
 const static uint8_t test_output_heavy_tube[96] = {
-    0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf,
-    0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35,
-    0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3,
-    0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74,
-    0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f,
-    0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb
+    0xFE, 0x53, 0x35, 0x20, 0x76, 0xEA, 0xE6, 0x89, 0xFA, 0x3B, 0x4F, 0xDA, 0x61, 0x46, 0x34, 0xCF,
+    0xC3, 0x12, 0xEE, 0x0C, 0x38, 0x7D, 0xF2, 0xB8, 0xB7, 0x4D, 0xA2, 0xA1, 0x59, 0x74, 0x12, 0x35,
+    0xCD, 0x3F, 0x29, 0xDF, 0x07, 0x4A, 0x14, 0xAD, 0x0B, 0x98, 0x99, 0x37, 0xCA, 0x14, 0x68, 0xA3,
+    0x8D, 0xAE, 0x86, 0xC1, 0xA3, 0x54, 0x05, 0xBE, 0xEA, 0x6D, 0x29, 0x24, 0x0C, 0x82, 0x97, 0x74,
+    0xA0, 0x64, 0x77, 0xCD, 0x8D, 0x8A, 0xC3, 0x10, 0xB4, 0x89, 0x0E, 0xBB, 0x7D, 0xE6, 0x32, 0x8F,
+    0xF4, 0x2D, 0xB6, 0x9E, 0x8A, 0xF9, 0xF8, 0xEE, 0x2C, 0xD0, 0x74, 0xED, 0xA9, 0xAA, 0xA1, 0xFB
 };
 
 // CN-Ultralite/Turtle
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index fb2f1217..8c5ed80d 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -2447,95 +2447,102 @@ public:
                              uint8_t* __restrict__ output,
                              ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
-        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
+        keccak(static_cast<const uint8_t*>(input) + size, (int) size, scratchPad[1]->state, 200);
 
         const uint8_t* l0 = scratchPad[0]->memory;
-        const uint8_t* l1 = scratchPad[1]->memory;
         uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+
+        const uint8_t* l1 = scratchPad[1]->memory;
         uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1);
 
-        uint64_t al0 = h0[0] ^h0[4];
-        uint64_t al1 = h1[0] ^h1[4];
-        uint64_t ah0 = h0[1] ^h0[5];
-        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t al0 = h0[0] ^ h0[4];
+        uint64_t al1 = h1[0] ^ h1[4];
+
+        uint64_t ah0 = h0[1] ^ h0[5];
+        uint64_t ah1 = h1[1] ^ h1[5];
 
         __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
 
-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx[2];
+        idx[0] = al0;
+        idx[1] = al1;
 
         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
             __m128i cx1;
 
-            if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-            } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+            __m128i ax0 = _mm_set_epi64x(ah0, al0);
+            __m128i ax1 = _mm_set_epi64x(ah1, al1);
 
-                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
-                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t *) &l0[idx[0] & MASK], ax0);
+                cx1 = soft_aesenc((uint32_t *) &l1[idx[1] & MASK], ax1);
+            } else {
+                cx0 = _mm_load_si128((__m128i *) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i *) &l1[idx[1] & MASK]);
+                cx0 = _mm_aesenc_si128(cx0, ax0);
+                cx1 = _mm_aesenc_si128(cx1, ax1);
             }
 
-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i *) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i *) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
 
-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-
-            bx0 = cx0;
-            bx1 = cx1;
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
 
             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);
 
             al0 += hi;
             ah0 += lo;
 
-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;
 
             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;
 
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = d ^ q;
+            bx0 = cx0;
 
-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);
 
             al1 += hi;
             ah1 += lo;
 
-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;
 
             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;
 
-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = d ^ q;
+
+            bx1 = cx1;
+
         }
 
         cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);
@@ -2572,76 +2579,78 @@ public:
         __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
 
-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx[2];
+
+        idx[0] = al0;
+        idx[1] = al1;
 
         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
             __m128i cx1;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);
 
                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }
 
-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
 
-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
 
             bx0 = cx0;
             bx1 = cx1;
 
             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);
 
             al0 += hi;
             ah0 += lo;
 
-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;
 
             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;
 
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = (~d) ^ q;
 
-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);
 
             al1 += hi;
             ah1 += lo;
 
-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;
 
             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;
 
-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = (~d) ^ q;
         }
 
         cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);
@@ -3464,9 +3473,10 @@ public:
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
         __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
 
-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
-        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx[3];
+        idx[0] = al0;
+        idx[1] = al1;
+        idx[2] = al2;
 
         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
@@ -3474,26 +3484,26 @@ public:
             __m128i cx2;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx[2] & MASK], _mm_set_epi64x(ah2, al2));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx[2] & MASK]);
 
                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
                 cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx[2] & MASK], _mm_xor_si128(bx2, cx2));
 
-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-            idx2 = EXTRACT64(cx2);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
+            idx[2] = EXTRACT64(cx2);
 
             bx0 = cx0;
             bx1 = cx1;
@@ -3501,70 +3511,70 @@ public:
 
             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);
 
             al0 += hi;
             ah0 += lo;
 
-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;
 
             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;
 
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = d ^ q;
 
-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);
 
             al1 += hi;
             ah1 += lo;
 
-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;
 
             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;
 
-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = d ^ q;
 
-            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-            lo = __umul128(idx2, cl, &hi);
+            cl = ((uint64_t*) &l2[idx[2] & MASK])[0];
+            ch = ((uint64_t*) &l2[idx[2] & MASK])[1];
+            lo = __umul128(idx[2], cl, &hi);
 
             al2 += hi;
             ah2 += lo;
 
-            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ((uint64_t*) &l2[idx[2] & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx[2] & MASK])[1] = ah2;
 
             ah2 ^= ch;
             al2 ^= cl;
-            idx2 = al2;
+            idx[2] = al2;
 
-            n = ((int64_t*)&l2[idx2 & MASK])[0];
-            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            n = ((int64_t*)&l2[idx[2] & MASK])[0];
+            d = ((int32_t*)&l2[idx[2] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = d ^ q;
+            ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q;
+            idx[2] = d ^ q;
         }
 
         cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);
@@ -3611,9 +3621,11 @@ public:
         __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
         __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
 
-        uint64_t idx0 = h0[0] ^h0[4];
-        uint64_t idx1 = h1[0] ^h1[4];
-        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx[3];
+
+        idx[0] = al0;
+        idx[1] = al1;
+        idx[2] = al2;
 
         for (size_t i = 0; i < ITERATIONS; i++) {
             __m128i cx0;
@@ -3621,26 +3633,26 @@ public:
             __m128i cx2;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx[2] & MASK], _mm_set_epi64x(ah2, al2));
             } else {
-                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx[2] & MASK]);
 
                 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
                 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
                 cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
-            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx[2] & MASK], _mm_xor_si128(bx2, cx2));
 
-            idx0 = EXTRACT64(cx0);
-            idx1 = EXTRACT64(cx1);
-            idx2 = EXTRACT64(cx2);
+            idx[0] = EXTRACT64(cx0);
+            idx[1] = EXTRACT64(cx1);
+            idx[2] = EXTRACT64(cx2);
 
             bx0 = cx0;
             bx1 = cx1;
@@ -3648,70 +3660,70 @@ public:
 
             uint64_t hi, lo, cl, ch;
-            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-            lo = __umul128(idx0, cl, &hi);
+            cl = ((uint64_t*) &l0[idx[0] & MASK])[0];
+            ch = ((uint64_t*) &l0[idx[0] & MASK])[1];
+            lo = __umul128(idx[0], cl, &hi);
 
             al0 += hi;
             ah0 += lo;
 
-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) &l0[idx[0] & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0;
 
             ah0 ^= ch;
             al0 ^= cl;
-            idx0 = al0;
+            idx[0] = al0;
 
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q;
+            idx[0] = (~d) ^ q;
 
-            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-            lo = __umul128(idx1, cl, &hi);
+            cl = ((uint64_t*) &l1[idx[1] & MASK])[0];
+            ch = ((uint64_t*) &l1[idx[1] & MASK])[1];
+            lo = __umul128(idx[1], cl, &hi);
 
             al1 += hi;
             ah1 += lo;
 
-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ((uint64_t*) &l1[idx[1] & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1;
 
             ah1 ^= ch;
             al1 ^= cl;
-            idx1 = al1;
+            idx[1] = al1;
 
-            n = ((int64_t*)&l1[idx1 & MASK])[0];
-            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            n = ((int64_t*)&l1[idx[1] & MASK])[0];
+            d = ((int32_t*)&l1[idx[1] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q;
+            idx[1] = (~d) ^ q;
 
-            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-            lo = __umul128(idx2, cl, &hi);
+            cl = ((uint64_t*) &l2[idx[2] & MASK])[0];
+            ch = ((uint64_t*) &l2[idx[2] & MASK])[1];
+            lo = __umul128(idx[2], cl, &hi);
 
             al2 += hi;
             ah2 += lo;
 
-            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ((uint64_t*) &l2[idx[2] & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx[2] & MASK])[1] = ah2;
 
             ah2 ^= ch;
             al2 ^= cl;
-            idx2 = al2;
+            idx[2] = al2;
 
-            n = ((int64_t*)&l2[idx2 & MASK])[0];
-            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            n = ((int64_t*)&l2[idx[2] & MASK])[0];
+            d = ((int32_t*)&l2[idx[2] & MASK])[2];
             q = n / (d | 0x5);
 
-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = (~d) ^ q;
+            ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q;
+            idx[2] = (~d) ^ q;
         }
 
         cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);