diff --git a/.gitignore b/.gitignore index 34695ef3..e9a0152c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,7 +5,7 @@ /.idea /CMakeFiles /src/3rdparty -/cmake-build-debug +/cmake-build-* CMakeCache.txt cmake_install.cmake Makefile diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index 3783b8d9..f6ab2935 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -105,7 +105,7 @@ bool CryptoNight::selfTest(int algo) { if (cryptonight_hash_ctx[0] == nullptr || cryptonight_hash_ctx[2] == nullptr || cryptonight_hash_ctx[2] == nullptr || cryptonight_hash_ctx[3] == nullptr || - cryptonight_hash_ctx[4] == nullptr) { + cryptonight_hash_ctx[4] == nullptr) { return false; } @@ -115,19 +115,19 @@ bool CryptoNight::selfTest(int algo) ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16); cryptonight_hash_ctx[0](test_input, 76, output, ctx); - bool resultSingle = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, 32) == 0; + bool resultSingle = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 32) == 0; cryptonight_hash_ctx[1](test_input, 76, output, ctx); - bool resultDouble = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, 64) == 0; + bool resultDouble = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 64) == 0; cryptonight_hash_ctx[2](test_input, 76, output, ctx); - bool resultTriple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, 96) == 0; + bool resultTriple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 96) == 0; cryptonight_hash_ctx[3](test_input, 76, output, ctx); - bool resultQuadruple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, 128) == 0; + bool resultQuadruple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 128) == 0; cryptonight_hash_ctx[4](test_input, 76, output, ctx); - bool resultQuintuple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, 160) == 0; + bool resultQuintuple = memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 160) == 0; _mm_free(ctx->memory); _mm_free(ctx); diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h new file mode 100644 index 00000000..504b9b9d --- /dev/null +++ b/src/crypto/CryptoNight_monero.h @@ -0,0 +1,57 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2016-2018 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef __CRYPTONIGHT_MONERO_H__ +#define __CRYPTONIGHT_MONERO_H__ + +// VARIANT ALTERATIONS +#define VARIANT1_INIT(part) \ + const uint8_t version##part = static_cast(input)[part * size]; \ + uint64_t tweak1_2_##part = 0; \ + /*if (MONERO)*/ { \ + if (version##part > 6) { \ + tweak1_2_##part = (*reinterpret_cast(reinterpret_cast(input) + 35 + part * size) ^ \ + *(reinterpret_cast(ctx->state[part]) + 24)); \ + } \ + } + +#define VARIANT1_1(p, part) \ + /*if (MONERO)*/ { \ + if (version##part > 6) { \ + const uint8_t tmp = reinterpret_cast(p)[11]; \ + static const uint32_t table = 0x75310; \ + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ + ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \ + } \ + } + +#define VARIANT1_2(p, part) \ + /*if (MONERO)*/ { \ + if (version##part > 6) { \ + (p) ^= tweak1_2_##part; \ + } \ + } + + +#endif /* __CRYPTONIGHT_MONERO_H__ */ diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index ed2b3feb..dde9b247 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -25,7 +25,7 @@ #define __CRYPTONIGHT_TEST_H__ -const static uint8_t test_input[456] = { +const static uint8_t test_input[] = { 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, @@ -46,7 +46,7 @@ const static uint8_t test_input[456] = { 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, - 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, + 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, @@ -59,7 +59,7 @@ const static uint8_t test_input[456] = { }; -const static uint8_t test_output0[192] = { +const static uint8_t test_output[] = { 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, @@ -68,14 +68,14 @@ const static uint8_t test_output0[192] = { 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, - 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, + 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00 }; -const static uint8_t test_output1[192] = { +const static uint8_t test_output_light[] = { 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, @@ -84,11 +84,16 @@ const static uint8_t test_output1[192] = { 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, - 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88 }; +const static uint8_t test_input_monero_slow[] = { + 0xb5, 0xa7, 0xf6, 0x3a, 0xbb, 0x94, 0xd0, 0x7d, 0x1a, 0x64, 0x45, 0xc3, 0x6c, 0x07, 0xc7, 0xe8, + 0x32, 0x7f, 0xe6, 0x1b, 0x16, 0x47, 0xe3, 0x91, 0xb4, 0xc7, 0xed, 0xae, 0x5d, 0xe5, 0x7a, 0x3d +}; + #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 04686f0c..e86c114a 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -36,6 +36,7 @@ #include "crypto/CryptoNight.h" +#include "crypto/CryptoNight_monero.h" #include "crypto/soft_aes.h" @@ -48,7 +49,6 @@ extern "C" #include "crypto/c_skein.h" } - static inline void do_blake_hash(const void* input, size_t len, char* output) { blake256_hash(reinterpret_cast(output), static_cast(input), len); @@ -332,10 +332,18 @@ public: uint64_t ah[NUM_HASH_BLOCKS]; __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t tweak1_2[NUM_HASH_BLOCKS]; + uint64_t version[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, - ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + version[hashBlock] = static_cast(input)[hashBlock * size]; + /*if (MONERO)*/ { + if (version[hashBlock] > 6) { + tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ + *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + } + } } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { @@ -363,6 +371,16 @@ public: _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + + /*if (MONERO)*/ { + if (version[hashBlock] > 6) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } + } + idx[hashBlock] = EXTRACT64(cx); bx[hashBlock] = cx; @@ -374,9 +392,21 @@ public: al[hashBlock] += hi; ah[hashBlock] += lo; + /*if (MONERO)*/ { + if (version[hashBlock] > 6) { + ah[hashBlock] ^= tweak1_2[hashBlock]; + } + } + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + /*if (MONERO)*/ { + if (version[hashBlock] > 6) { + ah[hashBlock] ^= tweak1_2[hashBlock]; + } + } + ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; @@ -410,6 +440,8 @@ public: keccak(static_cast(input), (int) size, ctx->state[0], 200); + VARIANT1_INIT(0); + l = ctx->memory; h = reinterpret_cast(ctx->state[0]); @@ -431,6 +463,7 @@ public: } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + VARIANT1_1(&l[idx & MASK], 0); idx = EXTRACT64(cx); bx = cx; @@ -442,8 +475,10 @@ public: al += hi; ah += lo; + VARIANT1_2(ah, 0); ((uint64_t*) &l[idx & MASK])[0] = al; ((uint64_t*) &l[idx & MASK])[1] = ah; + VARIANT1_2(ah, 0); ah ^= ch; al ^= cl; @@ -468,6 +503,9 @@ public: keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + VARIANT1_INIT(0); + VARIANT1_INIT(1); + const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; uint64_t* h0 = reinterpret_cast(ctx->state[0]); @@ -505,6 +543,9 @@ public: _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + VARIANT1_1(&l0[idx0 & MASK], 0); + VARIANT1_1(&l1[idx1 & MASK], 1); + idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -519,8 +560,10 @@ public: al0 += hi; ah0 += lo; + VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & MASK])[0] = al0; ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + VARIANT1_2(ah0, 0); ah0 ^= ch; al0 ^= cl; @@ -533,8 +576,10 @@ public: al1 += hi; ah1 += lo; + VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & MASK])[0] = al1; ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + VARIANT1_2(ah1, 1); ah1 ^= ch; al1 ^= cl; @@ -565,6 +610,10 @@ public: keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + VARIANT1_INIT(0); + VARIANT1_INIT(1); + VARIANT1_INIT(2); + const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; const uint8_t* l2 = ctx->memory + 2 * MEM; @@ -614,6 +663,10 @@ public: _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + VARIANT1_1(&l0[idx0 & MASK], 0); + VARIANT1_1(&l1[idx1 & MASK], 1); + VARIANT1_1(&l2[idx2 & MASK], 2); + idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); idx2 = EXTRACT64(cx2); @@ -631,8 +684,10 @@ public: al0 += hi; ah0 += lo; + VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & MASK])[0] = al0; ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + VARIANT1_2(ah0, 0); ah0 ^= ch; al0 ^= cl; @@ -646,8 +701,10 @@ public: al1 += hi; ah1 += lo; + VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & MASK])[0] = al1; ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + VARIANT1_2(ah1, 1); ah1 ^= ch; al1 ^= cl; @@ -661,8 +718,10 @@ public: al2 += hi; ah2 += lo; + VARIANT1_2(ah2, 2); ((uint64_t*) &l2[idx2 & MASK])[0] = al2; ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + VARIANT1_2(ah2, 2); ah2 ^= ch; al2 ^= cl; @@ -697,6 +756,11 @@ public: keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + VARIANT1_INIT(0); + VARIANT1_INIT(1); + VARIANT1_INIT(2); + VARIANT1_INIT(3); + const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; const uint8_t* l2 = ctx->memory + 2 * MEM; @@ -758,6 +822,11 @@ public: _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + VARIANT1_1(&l0[idx0 & MASK], 0); + VARIANT1_1(&l1[idx1 & MASK], 1); + VARIANT1_1(&l2[idx2 & MASK], 2); + VARIANT1_1(&l3[idx3 & MASK], 3); + idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); idx2 = EXTRACT64(cx2); @@ -777,8 +846,10 @@ public: al0 += hi; ah0 += lo; + VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & MASK])[0] = al0; ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + VARIANT1_2(ah0, 0); ah0 ^= ch; al0 ^= cl; @@ -792,8 +863,10 @@ public: al1 += hi; ah1 += lo; + VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & MASK])[0] = al1; ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + VARIANT1_2(ah1, 1); ah1 ^= ch; al1 ^= cl; @@ -807,8 +880,10 @@ public: al2 += hi; ah2 += lo; + VARIANT1_2(ah2, 2); ((uint64_t*) &l2[idx2 & MASK])[0] = al2; ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + VARIANT1_2(ah2, 2); ah2 ^= ch; al2 ^= cl; @@ -822,8 +897,10 @@ public: al3 += hi; ah3 += lo; + VARIANT1_2(ah3, 3); ((uint64_t*) &l3[idx3 & MASK])[0] = al3; ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + VARIANT1_2(ah3, 3); ah3 ^= ch; al3 ^= cl; @@ -862,6 +939,12 @@ public: keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + VARIANT1_INIT(0); + VARIANT1_INIT(1); + VARIANT1_INIT(2); + VARIANT1_INIT(3); + VARIANT1_INIT(4); + const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; const uint8_t* l2 = ctx->memory + 2 * MEM; @@ -935,6 +1018,12 @@ public: _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); + VARIANT1_1(&l0[idx0 & MASK], 0); + VARIANT1_1(&l1[idx1 & MASK], 1); + VARIANT1_1(&l2[idx2 & MASK], 2); + VARIANT1_1(&l3[idx3 & MASK], 3); + VARIANT1_1(&l4[idx4 & MASK], 4); + idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); idx2 = EXTRACT64(cx2); @@ -955,8 +1044,10 @@ public: al0 += hi; ah0 += lo; + VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & MASK])[0] = al0; ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + VARIANT1_2(ah0, 0); ah0 ^= ch; al0 ^= cl; @@ -970,8 +1061,10 @@ public: al1 += hi; ah1 += lo; + VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & MASK])[0] = al1; ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + VARIANT1_2(ah1, 1); ah1 ^= ch; al1 ^= cl; @@ -985,8 +1078,10 @@ public: al2 += hi; ah2 += lo; + VARIANT1_2(ah2, 2); ((uint64_t*) &l2[idx2 & MASK])[0] = al2; ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + VARIANT1_2(ah2, 2); ah2 ^= ch; al2 ^= cl; @@ -1000,8 +1095,10 @@ public: al3 += hi; ah3 += lo; + VARIANT1_2(ah3, 3); ((uint64_t*) &l3[idx3 & MASK])[0] = al3; ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + VARIANT1_2(ah3, 3); ah3 ^= ch; al3 ^= cl; @@ -1015,8 +1112,10 @@ public: al4 += hi; ah4 += lo; + VARIANT1_2(ah4, 4); ((uint64_t*) &l4[idx4 & MASK])[0] = al4; ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + VARIANT1_2(ah4, 4); ah4 ^= ch; al4 ^= cl;