From 4abcf937b22cd07590eb34090b4ae1a627ed966b Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 01:13:09 +0200
Subject: [PATCH 01/11] performance improvements

---
 src/crypto/CryptoNight_monero.h |  5 ++-
 src/crypto/CryptoNight_x86.h    | 61 ++++++++++++++++++---------------
 2 files changed, 36 insertions(+), 30 deletions(-)

diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h
index a667a3b3..fd3bc7d8 100644
--- a/src/crypto/CryptoNight_monero.h
+++ b/src/crypto/CryptoNight_monero.h
@@ -46,10 +46,9 @@
 #define VARIANT1_1(p) \
     if (VARIANT > 0) { \
-        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
         static const uint32_t table = 0x75310; \
-        const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
-        ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
+        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
+        ((uint8_t*)(p))[11] = tmp ^ ((table >> ((((tmp >> 3) & 6) | (tmp & 1)) << 1)) & 0x30); \
     }
 
 #define VARIANT1_2(p, part) \
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 414a1f7f..0329d634 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -266,7 +266,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }
 
-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -278,14 +280,14 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
 
-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }
 
@@ -307,16 +309,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);
 
-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+    __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
     {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);
 
         aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -335,15 +339,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }
 
     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
+        {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);
 
             aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
             aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
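Note on patch 01's VARIANT1_1 change: folding the index computation into a single expression is only valid if the byte written back at offset 11 is unchanged for every input. A standalone harness along these lines (not part of the patch; the function names are illustrative) checks the two forms against each other over all 256 byte values:

// Sketch verifying the variant-1 tweak fold from patch 01.
#include <cassert>
#include <cstdint>

static uint8_t tweak_original(uint8_t tmp)
{
    static const uint32_t table = 0x75310;
    const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
    return tmp ^ ((table >> index) & 0x30);   // pre-patch three-step form
}

static uint8_t tweak_folded(uint8_t tmp)
{
    static const uint32_t table = 0x75310;
    return tmp ^ ((table >> ((((tmp >> 3) & 6) | (tmp & 1)) << 1)) & 0x30);  // patched one-liner
}

int main()
{
    for (int v = 0; v < 256; ++v) {
        assert(tweak_original(uint8_t(v)) == tweak_folded(uint8_t(v)));
    }
    return 0;
}

The fold removes two named temporaries but performs the identical arithmetic, so any speedup comes from register pressure and scheduling rather than fewer operations.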
From 80796c4b482e2cb9e4ed9be74282ed5f95274f78 Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 09:48:45 +0200
Subject: [PATCH 02/11] performance improvement to memory hard loop

---
 src/crypto/CryptoNight_x86.h      | 67 +++++++++++++------------------
 src/crypto/CryptoNight_x86_loop.h | 62 ++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+), 39 deletions(-)
 create mode 100644 src/crypto/CryptoNight_x86_loop.h

diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 0329d634..43b815e7 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -37,6 +37,7 @@
 #include "crypto/CryptoNight.h"
 #include "crypto/CryptoNight_constants.h"
 #include "crypto/CryptoNight_monero.h"
+#include "crypto/CryptoNight_x86_loop.h"
 #include "crypto/soft_aes.h"
 
 
@@ -122,7 +123,6 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
 }
 #endif
 
-
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
 // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
 static inline __m128i sl_xor(__m128i tmp1)
@@ -266,7 +266,7 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }
 
-    __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
 
     for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -309,7 +309,7 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);
 
-    __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
 
     for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
     {
@@ -339,8 +339,6 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }
 
     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
-
         for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
         {
             xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
@@ -419,46 +417,37 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
+    void* memoryPointer = ((uint8_t*) l0) + ((idx0) & MASK);
 
-    for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx;
-
-        if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+    if(SOFT_AES && ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx;
+            SINGLEHASH_LOOP_SOFTAES
+            SINGLEHASH_LOOP_COMMON
+            SINGLEHASH_LOOP_CNHEAVY
         }
-        else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-            cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+    } else if(!SOFT_AES && ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx;
+            SINGLEHASH_LOOP_HARDAES
+            SINGLEHASH_LOOP_COMMON
+            SINGLEHASH_LOOP_CNHEAVY
         }
-        _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        VARIANT1_1(&l0[idx0 & MASK]);
-        idx0 = EXTRACT64(cx);
-        bx0 = cx;
+    } else {
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx;
 
-        uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-        lo = __umul128(idx0, cl, &hi);
+            if (SOFT_AES) {
+                SINGLEHASH_LOOP_SOFTAES
+            } else {
+                SINGLEHASH_LOOP_HARDAES
+            }
 
-        al0 += hi;
-        ah0 += lo;
+            SINGLEHASH_LOOP_COMMON
 
-        VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
-        VARIANT1_2(ah0, 0);
-
-        ah0 ^= ch;
-        al0 ^= cl;
-        idx0 = al0;
-
-        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
-            int64_t q = n / (d | 0x5);
-
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+                SINGLEHASH_LOOP_CNHEAVY
+            }
         }
     }
 
diff --git a/src/crypto/CryptoNight_x86_loop.h b/src/crypto/CryptoNight_x86_loop.h
new file mode 100644
index 00000000..fc44a145
--- /dev/null
+++ b/src/crypto/CryptoNight_x86_loop.h
@@ -0,0 +1,62 @@
+/* XMRig
+ * Copyright 2010      Jeff Garzik
+ * Copyright 2012-2014 pooler
+ * Copyright 2014      Lucas Jones
+ * Copyright 2014-2016 Wolf9466
+ * Copyright 2016      Jay D Dee
+ * Copyright 2017-2018 XMR-Stak    ,
+ * Copyright 2018      Lee Clagett
+ * Copyright 2018      aegroto
+ * Copyright 2016-2018 XMRig       ,
+ *
+ * This program is free software: you can redistribute it and/or modify
+ *   it under the terms of the GNU General Public License as published by
+ *   the Free Software Foundation, either version 3 of the License, or
+ *   (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ *   GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ *   along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __CRYPTONIGHT_X86_LOOP_H__
+#define __CRYPTONIGHT_X86_LOOP_H__
+
+#define SINGLEHASH_LOOP_COMMON \
+    _mm_store_si128((__m128i *) memoryPointer, _mm_xor_si128(bx0, cx)); \
+    VARIANT1_1(memoryPointer); \
+    idx0 = EXTRACT64(cx); \
+    memoryPointer = ((uint8_t*) l0) + ((idx0) & MASK); \
+    bx0 = cx; \
+    uint64_t hi, lo, cl, ch; \
+    cl = ((uint64_t*) memoryPointer)[0]; \
+    ch = ((uint64_t*) memoryPointer)[1]; \
+    lo = __umul128(idx0, cl, &hi); \
+    al0 += hi; \
+    ah0 += lo; \
+    VARIANT1_2(ah0, 0); \
+    ((uint64_t*) memoryPointer)[0] = al0; \
+    ((uint64_t*) memoryPointer)[1] = ah0; \
+    VARIANT1_2(ah0, 0); \
+    ah0 ^= ch; \
+    al0 ^= cl; \
+    memoryPointer = ((uint8_t*) l0) + ((al0) & MASK);
+
+#define SINGLEHASH_LOOP_CNHEAVY \
+    int64_t n = ((int64_t*)memoryPointer)[0]; \
+    int32_t d = ((int32_t*)memoryPointer)[2]; \
+    int64_t q = n / (d | 0x5); \
+    ((int64_t*) memoryPointer)[0] = n ^ q;
+
+#define SINGLEHASH_LOOP_SOFTAES \
+    cx = soft_aesenc((uint32_t*) memoryPointer, _mm_set_epi64x(ah0, al0));
+
+#define SINGLEHASH_LOOP_HARDAES \
+    cx = _mm_load_si128((__m128i *) memoryPointer); \
+    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
+
+#endif /* __CRYPTONIGHT_X86_LOOP_H__ */
\ No newline at end of file
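The idea behind patch 02 is to hoist the loop-invariant checks (SOFT_AES, CRYPTONIGHT_HEAVY) out of the memory-hard loop so each specialization contains only the work it needs. A minimal sketch of the same idea using a template parameter instead of macros (illustrative names, not XMRig's):

// Each instantiation gets its own loop body with the unwanted branch removed.
#include <cstddef>

template<bool HEAVY>
void inner_loop(long *pad, size_t iterations, size_t mask)
{
    size_t idx = 0;
    for (size_t i = 0; i < iterations; ++i) {
        long &cell = pad[idx & mask];
        cell ^= (long) i;            // stand-in for the AES/multiply step
        if (HEAVY) {                 // resolved at compile time, no run-time branch
            cell ^= cell >> 3;       // stand-in for the cn-heavy division step
        }
        idx = (size_t) cell;
    }
}

// The compiler emits two independent loop bodies:
template void inner_loop<false>(long*, size_t, size_t);
template void inner_loop<true>(long*, size_t, size_t);

Worth noting: in the original function, SOFT_AES and ALGO are already template parameters, so mainstream compilers eliminate the dead branch inside the loop anyway; that is consistent with patch 03 below reverting this macro refactoring without losing the measured gains.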
From d83320c321aa4d73a1a0def7108387677a0dff78 Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 15:52:26 +0200
Subject: [PATCH 03/11] reverted loop define refactoring, applied other little patches

---
 src/crypto/CryptoNight_x86.h      | 62 +++++++++++++++++--------------
 src/crypto/CryptoNight_x86_loop.h | 62 ------------------------------
 2 files changed, 35 insertions(+), 89 deletions(-)
 delete mode 100644 src/crypto/CryptoNight_x86_loop.h

diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 43b815e7..502d6e3b 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -6,6 +6,7 @@
  * Copyright 2016      Jay D Dee
  * Copyright 2017-2018 XMR-Stak    ,
  * Copyright 2018      Lee Clagett
+ * Copyright 2018      aegroto
  * Copyright 2016-2018 XMRig       ,
  *
  * This program is free software: you can redistribute it and/or modify
@@ -37,7 +38,6 @@
 #include "crypto/CryptoNight.h"
 #include "crypto/CryptoNight_constants.h"
 #include "crypto/CryptoNight_monero.h"
-#include "crypto/CryptoNight_x86_loop.h"
 #include "crypto/soft_aes.h"
 
 
@@ -417,37 +417,45 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
-    void* memoryPointer = ((uint8_t*) l0) + ((idx0) & MASK);
+    void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
+
+    for (size_t i = 0; i < ITERATIONS; i++) {
+        __m128i cx;
 
-    if(SOFT_AES && ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < ITERATIONS; i++) {
-            __m128i cx;
-            SINGLEHASH_LOOP_SOFTAES
-            SINGLEHASH_LOOP_COMMON
-            SINGLEHASH_LOOP_CNHEAVY
+        if (SOFT_AES) {
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
+        } else {
+            cx = _mm_load_si128((__m128i *) mp);
+            cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }
-    } else if(!SOFT_AES && ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < ITERATIONS; i++) {
-            __m128i cx;
-            SINGLEHASH_LOOP_HARDAES
-            SINGLEHASH_LOOP_COMMON
-            SINGLEHASH_LOOP_CNHEAVY
-        }
-    } else {
-        for (size_t i = 0; i < ITERATIONS; i++) {
-            __m128i cx;
 
-            if (SOFT_AES) {
-                SINGLEHASH_LOOP_SOFTAES
-            } else {
-                SINGLEHASH_LOOP_HARDAES
-            }
+        _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
+        VARIANT1_1(mp);
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
+        bx0 = cx;
 
-            SINGLEHASH_LOOP_COMMON
+        uint64_t hi, lo, cl, ch;
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
+        lo = __umul128(idx0, cl, &hi);
 
-            if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-                SINGLEHASH_LOOP_CNHEAVY
-            }
+        al0 += hi;
+        ah0 += lo;
+
+        VARIANT1_2(ah0, 0);
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
+        VARIANT1_2(ah0, 0);
+
+        ah0 ^= ch;
+        al0 ^= cl;
+        mp = ((uint8_t*) l0) + ((al0) & MASK);
+
+        if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
+            int64_t n = ((int64_t*)mp)[0];
+            int32_t d = ((int32_t*)mp)[2];
+            int64_t q = n / (d | 0x5);
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }
 
diff --git a/src/crypto/CryptoNight_x86_loop.h b/src/crypto/CryptoNight_x86_loop.h
deleted file mode 100644
index fc44a145..00000000
--- a/src/crypto/CryptoNight_x86_loop.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/* XMRig
- * Copyright 2010      Jeff Garzik
- * Copyright 2012-2014 pooler
- * Copyright 2014      Lucas Jones
- * Copyright 2014-2016 Wolf9466
- * Copyright 2016      Jay D Dee
- * Copyright 2017-2018 XMR-Stak    ,
- * Copyright 2018      Lee Clagett
- * Copyright 2018      aegroto
- * Copyright 2016-2018 XMRig       ,
- *
- * This program is free software: you can redistribute it and/or modify
- *   it under the terms of the GNU General Public License as published by
- *   the Free Software Foundation, either version 3 of the License, or
- *   (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- *   but WITHOUT ANY WARRANTY; without even the implied warranty of
- *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- *   GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- *   along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __CRYPTONIGHT_X86_LOOP_H__
-#define __CRYPTONIGHT_X86_LOOP_H__
-
-#define SINGLEHASH_LOOP_COMMON \
-    _mm_store_si128((__m128i *) memoryPointer, _mm_xor_si128(bx0, cx)); \
-    VARIANT1_1(memoryPointer); \
-    idx0 = EXTRACT64(cx); \
-    memoryPointer = ((uint8_t*) l0) + ((idx0) & MASK); \
-    bx0 = cx; \
-    uint64_t hi, lo, cl, ch; \
-    cl = ((uint64_t*) memoryPointer)[0]; \
-    ch = ((uint64_t*) memoryPointer)[1]; \
-    lo = __umul128(idx0, cl, &hi); \
-    al0 += hi; \
-    ah0 += lo; \
-    VARIANT1_2(ah0, 0); \
-    ((uint64_t*) memoryPointer)[0] = al0; \
-    ((uint64_t*) memoryPointer)[1] = ah0; \
-    VARIANT1_2(ah0, 0); \
-    ah0 ^= ch; \
-    al0 ^= cl; \
-    memoryPointer = ((uint8_t*) l0) + ((al0) & MASK);
-
-#define SINGLEHASH_LOOP_CNHEAVY \
-    int64_t n = ((int64_t*)memoryPointer)[0]; \
-    int32_t d = ((int32_t*)memoryPointer)[2]; \
-    int64_t q = n / (d | 0x5); \
-    ((int64_t*) memoryPointer)[0] = n ^ q;
-
-#define SINGLEHASH_LOOP_SOFTAES \
-    cx = soft_aesenc((uint32_t*) memoryPointer, _mm_set_epi64x(ah0, al0));
-
-#define SINGLEHASH_LOOP_HARDAES \
-    cx = _mm_load_si128((__m128i *) memoryPointer); \
-    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
-
-#endif /* __CRYPTONIGHT_X86_LOOP_H__ */
\ No newline at end of file
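The surviving change from patch 03 is pointer caching: compute the scratchpad address once per step and reuse it through `mp`, instead of re-evaluating `l0[idx0 & MASK]` at every access. A minimal sketch of the pattern (illustrative names, not XMRig's; the mask is assumed to keep an 8-byte window inside the buffer, as the miner's MASK does):

// Pointer-caching sketch: one address computation per step, many accesses through it.
#include <cstdint>
#include <cstring>

void walk(uint8_t *pad, uint64_t idx, uint64_t mask, size_t steps)
{
    uint8_t *mp = pad + (idx & mask);          // cached once...
    for (size_t i = 0; i < steps; ++i) {
        uint64_t cl;
        std::memcpy(&cl, mp, sizeof(cl));      // reads go through the cached pointer
        cl = cl * 6364136223846793005ULL + 1442695040888963407ULL;  // stand-in mixing step
        std::memcpy(mp, &cl, sizeof(cl));
        idx = cl;
        mp = pad + (idx & mask);               // ...and refreshed only when idx changes
    }
}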
From 46d20338cb4b0dc20c19bbe680d8cd90a62a321d Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 16:35:08 +0200
Subject: [PATCH 04/11] applied patches to cryptonight arm

---
 src/crypto/CryptoNight_arm.h | 89 +++++++++++++++++++-----------------
 src/crypto/CryptoNight_x86.h |  6 +--
 2 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index fd8b58ff..9fb8c42b 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -7,6 +7,7 @@
  * Copyright 2016      Imran Yusuff
  * Copyright 2017-2018 XMR-Stak    ,
  * Copyright 2018      Lee Clagett
+ * Copyright 2018      aegroto
  * Copyright 2016-2018 XMRig       ,
  *
  * This program is free software: you can redistribute it and/or modify
@@ -272,7 +273,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }
 
-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         if (!SOFT_AES) {
             aes_round<SOFT_AES>(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         }
@@ -301,14 +304,14 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
             aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         }
 
-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }
 
@@ -330,16 +333,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);
 
-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
     {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);
 
         if (!SOFT_AES) {
             aes_round<SOFT_AES>(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -375,15 +380,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }
 
     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);
 
             if (!SOFT_AES) {
                 aes_round<SOFT_AES>(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -486,49 +491,47 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
+    void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
 
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        }
-        else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
+        } else {
+            cx = _mm_load_si128((__m128i *) mp);
 #           ifndef XMRIG_ARMv7
-            cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
+            cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
 #           endif
         }
 
         _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        VARIANT1_1(&l0[idx0 & MASK]);
-        idx0 = EXTRACT64(cx);
+        VARIANT1_1(mp);
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);
 
        al0 += hi;
        ah0 += lo;
 
         VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
        VARIANT1_2(ah0, 0);
 
         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*)mp)[0] = n ^ q;
         }
     }
 
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 502d6e3b..95422449 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -311,8 +311,7 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
 
     const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
 
-    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
-    {
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
         xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
         xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
         xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
@@ -339,8 +338,7 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }
 
     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
-        {
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
         xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
         xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
         xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
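Patches 01 and 04 both replace an indexed scratchpad loop with a pointer walk. A sketch contrasting the two forms (SSE2 intrinsics as in the x86 file; the buffer is assumed 16-byte aligned with a length that is a multiple of 8 vectors). Both store the same data; the pointer form saves the per-iteration address arithmetic only where the compiler does not already strength-reduce it:

#include <emmintrin.h>
#include <cstddef>

void fill_indexed(__m128i *out, size_t n, __m128i v)
{
    for (size_t i = 0; i < n; i += 8) {
        for (size_t j = 0; j < 8; ++j) {
            _mm_store_si128(out + i + j, v);   // address recomputed from i each time
        }
    }
}

void fill_pointer(__m128i *out, size_t n, __m128i v)
{
    const __m128i *limit = out + n;
    for (__m128i *p = out; p < limit; p += 8) {
        for (size_t j = 0; j < 8; ++j) {
            _mm_store_si128(p + j, v);         // address derived from the walking pointer
        }
    }
}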
From 2fcc2a48ce234ad45024630babf95db88891d16e Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 15:32:08 +0000
Subject: [PATCH 05/11] fallback for arm patch, needs fix

---
 src/crypto/CryptoNight_arm.h | 30 ++++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 9fb8c42b..8abc8afd 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -491,47 +491,49 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
-    void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
 
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
-        } else {
-            cx = _mm_load_si128((__m128i *) mp);
+            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+        }
+        else {
+            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
 #           ifndef XMRIG_ARMv7
-            cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
+            cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
 #           endif
         }
 
         _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        VARIANT1_1(mp);
-        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
+        VARIANT1_1(&l0[idx0 & MASK]);
+        idx0 = EXTRACT64(cx);
        bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) mp)[0];
-        ch = ((uint64_t*) mp)[1];
+        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
         lo = __umul128(idx0, cl, &hi);
 
         al0 += hi;
         ah0 += lo;
 
         VARIANT1_2(ah0, 0);
-        ((uint64_t*) mp)[0] = al0;
-        ((uint64_t*) mp)[1] = ah0;
+        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
         VARIANT1_2(ah0, 0);
 
         ah0 ^= ch;
         al0 ^= cl;
+        idx0 = al0;
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*) mp)[0];
-            int32_t d = ((int32_t*) mp)[2];
+            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
             int64_t q = n / (d | 0x5);
-            ((int64_t*)mp)[0] = n ^ q;
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            idx0 = d ^ q;
         }
     }
 

From 69997a3a06073144de2f58e4bdea490a7704a52e Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 9 Apr 2018 21:06:23 +0200
Subject: [PATCH 06/11] cryptonight x86 modifications

---
 src/crypto/CryptoNight_x86.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 95422449..c44635b0 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -416,7 +416,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     uint64_t idx0 = h0[0] ^ h0[4];
 
     void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
-    
+
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
From 574673fe43f8e25ebf4a7a09c4fb87ad227426ff Mon Sep 17 00:00:00 2001
From: aegroto
Date: Tue, 10 Apr 2018 09:03:56 +0200
Subject: [PATCH 07/11] applied the single hash patch to the double hash code

---
 src/crypto/CryptoNight_x86.h | 60 ++++++++++++++++++------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 628676c0..333e2191 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -517,83 +517,83 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
     uint64_t idx0 = h0[0] ^ h0[4];
     uint64_t idx1 = h1[0] ^ h1[4];
 
+    void* mp0 = ((uint8_t*) l0) + ((idx0) & MASK);
+    void* mp1 = ((uint8_t*) l1) + ((idx1) & MASK);
+
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx0, cx1;
 
         if (SOFT_AES) {
-            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-        }
-        else {
-            cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-            cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
+            cx0 = soft_aesenc((uint32_t*) mp0, _mm_set_epi64x(ah0, al0));
+            cx1 = soft_aesenc((uint32_t*) mp1, _mm_set_epi64x(ah1, al1));
+        } else {
+            cx0 = _mm_load_si128((__m128i *) mp0);
+            cx1 = _mm_load_si128((__m128i *)mp1);
             cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
             cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
         }
 
         if (VARIANT > 0) {
-            cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            cryptonight_monero_tweak((uint64_t*)&l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            cryptonight_monero_tweak((uint64_t*)mp0, _mm_xor_si128(bx0, cx0));
+            cryptonight_monero_tweak((uint64_t*)mp1, _mm_xor_si128(bx1, cx1));
         } else {
-            _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-            _mm_store_si128((__m128i *) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i *) mp0, _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i *) mp1, _mm_xor_si128(bx1, cx1));
         }
 
-        idx0 = EXTRACT64(cx0);
-        idx1 = EXTRACT64(cx1);
+        mp0 = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx0)) & MASK);
+        mp1 = ((uint8_t*) l1) + ((idx1 = EXTRACT64(cx1)) & MASK);
 
         bx0 = cx0;
         bx1 = cx1;
 
         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp0)[0];
+        ch = ((uint64_t*) mp0)[1];
         lo = __umul128(idx0, cl, &hi);
 
         al0 += hi;
         ah0 += lo;
 
         VARIANT1_2(ah0, 0);
-        ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp0)[0] = al0;
+        ((uint64_t*) mp0)[1] = ah0;
         VARIANT1_2(ah0, 0);
 
         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
+        mp0 = ((uint8_t*) l0) + ((al0) & MASK);
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)mp0)[0];
+            int32_t d = ((int32_t*)mp0)[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp0)[0] = n ^ q;
         }
 
-        cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-        ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+        cl = ((uint64_t*) mp1)[0];
+        ch = ((uint64_t*) mp1)[1];
         lo = __umul128(idx1, cl, &hi);
 
         al1 += hi;
         ah1 += lo;
 
         VARIANT1_2(ah1, 1);
-        ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-        ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+        ((uint64_t*) mp1)[0] = al1;
+        ((uint64_t*) mp1)[1] = ah1;
         VARIANT1_2(ah1, 1);
 
         ah1 ^= ch;
         al1 ^= cl;
-        idx1 = al1;
+        mp1 = ((uint8_t*) l1) + ((al1) & MASK);
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l1[idx1 & MASK])[0];
-            int32_t d = ((int32_t*)&l1[idx1 & MASK])[2];
+            int64_t n = ((int64_t*)mp1)[0];
+            int32_t d = ((int32_t*)mp1)[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*)mp1)[0] = n ^ q;
         }
     }
 
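The heart of the loop that patch 07 duplicates across both lanes is a 64x64 -> 128-bit multiply. The miner uses a __umul128 helper (compiler intrinsic on MSVC, hand-rolled fallback elsewhere); the same operation written with GCC/Clang's unsigned __int128 extension, as a self-contained sketch:

#include <cstdint>

// hi:lo = a * b; the low 64 bits are returned, mirroring __umul128's contract.
static inline uint64_t mul128(uint64_t a, uint64_t b, uint64_t *hi)
{
    const unsigned __int128 r = (unsigned __int128) a * b;
    *hi = (uint64_t)(r >> 64);
    return (uint64_t) r;
}

int main()
{
    uint64_t hi;
    const uint64_t lo = mul128(0xFFFFFFFFFFFFFFFFULL, 2, &hi);
    // (2^64 - 1) * 2 = 2^65 - 2, so hi = 1 and lo = 0xFFFFFFFFFFFFFFFE.
    return (hi == 1 && lo == 0xFFFFFFFFFFFFFFFEULL) ? 0 : 1;
}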
From 89ea30bc9d7e6922f88f28974fa60a8990c0f122 Mon Sep 17 00:00:00 2001
From: aegroto
Date: Tue, 10 Apr 2018 07:16:06 +0000
Subject: [PATCH 08/11] arm patch fix

---
 src/crypto/CryptoNight_arm.h | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 8abc8afd..afccb0a7 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -491,49 +491,49 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
+    void* mp = (uint8_t*) l0 + (idx0 & MASK);
 
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
         }
         else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = _mm_load_si128((__m128i *) mp);
 #           ifndef XMRIG_ARMv7
             cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
 #           endif
         }
 
-        _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        VARIANT1_1(&l0[idx0 & MASK]);
-        idx0 = EXTRACT64(cx);
+        _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
+        VARIANT1_1(mp);
+        mp = (uint8_t*) l0 + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);
 
         al0 += hi;
         ah0 += lo;
 
         VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
         VARIANT1_2(ah0, 0);
 
         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
+        mp = (uint8_t*) l0 + (al0 & MASK);
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }
 

From 33e710d591c71d53bf67a171c067677cbbc59b5e Mon Sep 17 00:00:00 2001
From: aegroto
Date: Mon, 16 Apr 2018 01:39:27 +0200
Subject: [PATCH 09/11] optimization to double hash tweak

---
 src/crypto/CryptoNight_x86.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 333e2191..28d3b402 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -398,8 +398,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
     uint8_t x = vh >> 24;
 
     static const uint16_t table = 0x7531;
-    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
-    vh ^= ((table >> index) & 0x3) << 28;
+    vh ^= ((table >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 0x3) << 28;
 
     mem_out[1] = vh;
 }
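One pattern that recurs in these loops is the paired VARIANT1_2 around the scratchpad store. VARIANT1_2 xors the per-hash tweak into the value; applying it before and again after the store means only the stored copy carries the tweak while the running state is restored. A minimal sketch of the mechanism (stand-in constants, not the miner's actual tweak values):

#include <cassert>
#include <cstdint>

int main()
{
    uint64_t ah = 0x0123456789abcdefULL;     // running high word of the state
    const uint64_t tweak = 0x00000000deadbeefULL;  // stand-in for tweak1_2
    uint64_t mem;

    ah ^= tweak;     // first VARIANT1_2: tweak in
    mem = ah;        // the scratchpad write carries the tweaked value
    ah ^= tweak;     // second VARIANT1_2: xor is its own inverse, state restored

    assert(ah == 0x0123456789abcdefULL);
    assert(mem == (0x0123456789abcdefULL ^ 0x00000000deadbeefULL));
    return 0;
}

This is why the two seemingly redundant VARIANT1_2(ah0, 0) lines must both survive every refactoring in this series: removing either one changes the hash.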
From bf123c7e360df01e87e7ab34d4b9fd9e8d2d802c Mon Sep 17 00:00:00 2001
From: aegroto
Date: Thu, 19 Apr 2018 16:49:51 +0000
Subject: [PATCH 10/11] applied patch to ARM

---
 src/crypto/CryptoNight_arm.h | 37 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 20 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 8abc8afd..b713eba2 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -479,7 +479,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     keccak(input, (int) size, ctx->state0, 200);
 
-    VARIANT1_INIT(0);
+    VARIANT1_INIT(0)
 
     cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx->state0, (__m128i*) ctx->memory);
 
@@ -491,49 +491,47 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
-    
+    void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
+
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        }
-        else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
+        } else {
+            cx = _mm_load_si128((__m128i *) mp);
 #           ifndef XMRIG_ARMv7
             cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
 #           endif
         }
 
-        _mm_store_si128((__m128i *) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
-        VARIANT1_1(&l0[idx0 & MASK]);
-        idx0 = EXTRACT64(cx);
+        _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
+        VARIANT1_1(mp);
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);
 
         al0 += hi;
         ah0 += lo;
 
         VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
         VARIANT1_2(ah0, 0);
 
         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
+        mp = ((uint8_t*) l0) + ((al0) & MASK);
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*)mp)[0];
+            int32_t d = ((int32_t*)mp)[2];
             int64_t q = n / (d | 0x5);
-
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }
 
@@ -543,7 +541,6 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output);
 }
 
-
 template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
 inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx *__restrict__ ctx)
 {

From 81edb492112ae543e4187fb40f81fc486195bd31 Mon Sep 17 00:00:00 2001
From: aegroto
Date: Sun, 22 Apr 2018 10:34:20 +0000
Subject: [PATCH 11/11] arm patch fix

---
 src/crypto/CryptoNight_arm.h | 47 ++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 2e24d660..02c2479b 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -7,7 +7,6 @@
  * Copyright 2016      Imran Yusuff
  * Copyright 2017-2018 XMR-Stak    ,
  * Copyright 2018      Lee Clagett
- * Copyright 2018      aegroto
  * Copyright 2016-2018 XMRig       ,
  *
  * This program is free software: you can redistribute it and/or modify
@@ -264,10 +263,6 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
     const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
 
     for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
-        if (!SOFT_AES) {
-            aes_round<SOFT_AES>(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
-        }
-
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -279,6 +274,7 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
 
+
         _mm_store_si128(outputTmp, xin0);
         _mm_store_si128(outputTmp + 1, xin1);
@@ -286,7 +282,7 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         _mm_store_si128(outputTmp + 4, xin4);
         _mm_store_si128(outputTmp + 5, xin5);
         _mm_store_si128(outputTmp + 6, xin6);
-        _mm_store_si128(outputTmp + 7, xin7);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }
 
@@ -396,9 +392,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
     uint64_t vh = vgetq_lane_u64(tmp, 1);
     uint8_t x = vh >> 24;
 
-    static const uint16_t table = 0x7531;
-    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
-    vh ^= ((table >> index) & 0x3) << 28;
+    vh ^= ((0x7531 >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 0x3) << 28;
 
     mem_out[1] = vh;
 }
@@ -418,7 +412,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     keccak(input, (int) size, ctx[0]->state, 200);
 
-    VARIANT1_INIT(0)
+    VARIANT1_INIT(0);
 
     cn_explode_scratchpad<ALGO, MEM, SOFT_AES>((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory);
 
@@ -430,24 +424,26 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
 
     uint64_t idx0 = h0[0] ^ h0[4];
+    void* mp = ((uint8_t*) l0) + (idx0 & MASK);
 
-    void* mp = ((uint8_t*) l0) + ((idx0) & MASK);
-    
     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;
 
         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
-        } else {
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
+        }
+        else {
             cx = _mm_load_si128((__m128i *) mp);
-#           ifndef XMRIG_ARMv7
-            cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-#           endif
+            cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }
 
-        _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
-        VARIANT1_1(mp);
-        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
+        if (VARIANT > 0) {
+            cryptonight_monero_tweak((uint64_t*) mp, _mm_xor_si128(bx0, cx));
+        } else {
+            _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
+        }
+
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;
 
         uint64_t hi, lo, cl, ch;
@@ -465,13 +461,14 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
 
         ah0 ^= ch;
         al0 ^= cl;
-        mp = ((uint8_t*) l0) + ((al0) & MASK);
+        mp = ((uint8_t*) l0) + (al0 & MASK);
 
         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)mp)[0];
-            int32_t d = ((int32_t*)mp)[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);
-            ((int64_t*) mp)[0] = n ^ q;
+
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }
 
@@ -481,6 +478,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }
 
+
 template<xmrig::Algo ALGO, bool SOFT_AES, xmrig::Variant VARIANT>
 inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, struct cryptonight_ctx **__restrict__ ctx)
 {
@@ -627,3 +625,4 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
 }
 
 #endif /* __CRYPTONIGHT_ARM_H__ */
+
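Background on the ARMv8 path that patch 11 replaces: the pre-patch code computed the hardware AES round with the NEON crypto instructions and xored the round key afterwards, because AESE adds the key before SubBytes/ShiftRows while x86's AESENC adds it last. The patched code instead calls _mm_aesenc_si128, which on an ARM build can only work if the project supplies an SSE compatibility shim for it (an assumption here; plain arm_neon.h provides no such intrinsic). A minimal NEON equivalent of one AESENC round, requiring the ARMv8 Crypto extension (e.g. compile with -march=armv8-a+crypto):

#include <arm_neon.h>

static inline uint8x16_t aesenc_neon(uint8x16_t state, uint8x16_t round_key)
{
    state = vaeseq_u8(state, vdupq_n_u8(0)); // AddRoundKey(0) + SubBytes + ShiftRows
    state = vaesmcq_u8(state);               // MixColumns
    return veorq_u8(state, round_key);       // AddRoundKey last, matching x86 AESENC
}

Passing a zero key to AESE and xoring the real round key after MixColumns is exactly the vaeseq_u8/vaesmcq_u8 idiom the diff removes, which is why the two code paths produce identical round outputs.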