diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index 746de79c..02c2479b 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -260,7 +260,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -272,14 +274,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);

-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }

@@ -301,16 +304,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
     {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

         aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -329,15 +334,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }

     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

             aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
             aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -387,9 +392,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
     uint64_t vh = vgetq_lane_u64(tmp, 1);

     uint8_t x = vh >> 24;
-    static const uint16_t table = 0x7531;
-    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
-    vh ^= ((table >> index) & 0x3) << 28;
+    vh ^= ((0x7531 >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 0x3) << 28;

     mem_out[1] = vh;
 }
@@ -421,51 +424,52 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);

     uint64_t idx0 = h0[0] ^ h0[4];
+    void* mp = ((uint8_t*) l0) + (idx0 & MASK);

     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;

         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
         }
         else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = _mm_load_si128((__m128i *) mp);
             cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }

         if (VARIANT > 0) {
-            cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            cryptonight_monero_tweak((uint64_t*) mp, _mm_xor_si128(bx0, cx));
         } else {
-            _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
         }

-        idx0 = EXTRACT64(cx);
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;

         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);

         al0 += hi;
         ah0 += lo;

         VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
         VARIANT1_2(ah0, 0);

         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
+        mp = ((uint8_t*) l0) + (al0 & MASK);

         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
+            mp = ((uint8_t*) l0) + ((d ^ q) & MASK);
         }
     }

@@ -622,3 +626,4 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
 }

 #endif /* __CRYPTONIGHT_ARM_H__ */
+
diff --git a/src/crypto/CryptoNight_monero.h b/src/crypto/CryptoNight_monero.h
index ea1622ab..06217d4a 100644
--- a/src/crypto/CryptoNight_monero.h
+++ b/src/crypto/CryptoNight_monero.h
@@ -46,10 +46,9 @@

 #define VARIANT1_1(p) \
     if (VARIANT > 0) { \
-        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
         static const uint32_t table = 0x75310; \
-        const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
-        ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
+        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
+        ((uint8_t*)(p))[11] = tmp ^ ((table >> ((((tmp >> 3) & 6) | (tmp & 1)) << 1)) & 0x30); \
     }

 #define VARIANT1_2(p, part) \
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index 66bcf8b5..9961ef8b 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -6,6 +6,7 @@
  * Copyright 2016      Jay D Dee   <jayddee246@gmail.com>
  * Copyright 2017-2018 XMR-Stak    <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
  * Copyright 2018      Lee Clagett <code@leeclagett.com>
+ * Copyright 2018      aegroto
  * Copyright 2016-2018 XMRig       <https://github.com/xmrig>, <support@xmrig.com>
  *
  * This program is free software: you can redistribute it and/or modify
@@ -122,7 +123,6 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
 }
 #endif

-
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
 // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
 static inline __m128i sl_xor(__m128i tmp1)
@@ -266,7 +266,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -278,14 +280,14 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);

-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }

@@ -307,16 +309,17 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
-    {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

         aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -335,15 +338,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }

     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

             aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
             aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -403,6 +406,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 }


+
 template<xmrig::Algo ALGO, bool SOFT_AES, int VARIANT>
 inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
 {
@@ -429,60 +433,61 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);

     uint64_t idx0 = al0;
+    void* mp = ((uint8_t*) l0) + (idx0 & MASK);

     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;

         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
         }
         else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = _mm_load_si128((__m128i *) mp);
             cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }

         if (VARIANT > 0) {
-            cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            cryptonight_monero_tweak((uint64_t*) mp, _mm_xor_si128(bx0, cx));
         } else {
-            _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
         }

-        idx0 = EXTRACT64(cx);
+        mp = (void*) &l0[(idx0 = EXTRACT64(cx)) & MASK];
         bx0 = cx;

         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);

         al0 += hi;
         ah0 += lo;

-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+        ((uint64_t*) mp)[0] = al0;

         if (VARIANT > 0) {
             if (VARIANT == xmrig::VARIANT_IPBC) {
-                ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
+                ((uint64_t*) mp)[1] = ah0 ^ tweak1_2_0 ^ al0;
             } else {
-                ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
+                ((uint64_t*) mp)[1] = ah0 ^ tweak1_2_0;
             }
         } else {
-            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) mp)[1] = ah0;
         }

         al0 ^= cl;
         ah0 ^= ch;
-        idx0 = al0;
+        mp = (void*) &l0[al0 & MASK];

         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
+            mp = (void*) &l0[(d ^ q) & MASK];
         }
     }

@@ -492,7 +497,6 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }

-
 template<xmrig::Algo ALGO, bool SOFT_AES, int VARIANT>
 inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
 {
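Note for reviewers, not part of the patch: the sketch below shows, in isolation, the two addressing patterns the diff applies. The scratchpad size, the MASK_EXAMPLE constant and the walk_*/fill_* names are invented for the example; only the idea matches the patch: cache the masked scratchpad address in a pointer (the mp variable in cryptonight_single_hash) and refresh it only when the index changes instead of re-evaluating &l0[idx0 & MASK] at every access, and walk cn_explode_scratchpad/cn_implode_scratchpad with a pointer bounded by a precomputed limit instead of indexing output + i + n.

// Illustrative sketch only -- names and sizes are made up, it is not xmrig code.
#include <cstdint>
#include <cstring>
#include <vector>

static constexpr uint64_t MASK_EXAMPLE = 0x1FFFF8; // keeps 8-byte reads inside a 2 MiB pad

// Baseline pattern: the masked address is recomputed at every access.
static uint64_t walk_indexed(const uint8_t* pad, uint64_t idx, size_t rounds)
{
    uint64_t acc = 0;
    for (size_t i = 0; i < rounds; ++i) {
        uint64_t v;
        std::memcpy(&v, &pad[idx & MASK_EXAMPLE], sizeof(v)); // address computed here each round
        acc += v;
        idx = acc ^ v;                                        // data-dependent next index
    }
    return acc;
}

// Patched pattern: the pointer is cached and refreshed only when the index changes,
// mirroring "void* mp = ((uint8_t*) l0) + (idx0 & MASK);" in the diff.
static uint64_t walk_pointer(const uint8_t* pad, uint64_t idx, size_t rounds)
{
    uint64_t acc = 0;
    const uint8_t* mp = pad + (idx & MASK_EXAMPLE);
    for (size_t i = 0; i < rounds; ++i) {
        uint64_t v;
        std::memcpy(&v, mp, sizeof(v));                       // reuse the cached pointer
        acc += v;
        mp = pad + ((acc ^ v) & MASK_EXAMPLE);                // recompute only on index change
    }
    return acc;
}

// Loop pattern from cn_explode/cn_implode, reduced to plain integers:
// walk a pointer up to a precomputed limit instead of indexing "base + i + n".
// count is assumed even, like MEM / sizeof(__m128i) is a multiple of 8 in the patch.
static void fill_indexed(uint64_t* out, size_t count)
{
    for (size_t i = 0; i < count; i += 2) {
        out[i + 0] = i;
        out[i + 1] = i + 1;
    }
}

static void fill_pointer(uint64_t* out, size_t count)
{
    const uint64_t* limit = out + count;                      // analogous to outputTmpLimit
    for (uint64_t* p = out; p < limit; p += 2) {
        p[0] = static_cast<uint64_t>(p - out);
        p[1] = static_cast<uint64_t>(p - out) + 1;
    }
}

int main()
{
    std::vector<uint8_t> pad(2 * 1024 * 1024, 1);             // stand-in for the 2 MiB scratchpad
    std::vector<uint64_t> a(16), b(16);
    fill_indexed(a.data(), a.size());
    fill_pointer(b.data(), b.size());

    const bool same_walk = walk_indexed(pad.data(), 12345, 1000) == walk_pointer(pad.data(), 12345, 1000);
    const bool same_fill = (a == b);
    return (same_walk && same_fill) ? 0 : 1;                  // both pairs must agree; only the addressing differs
}

The pointer forms perform the same loads and stores as the indexed forms; the benefit, if any, is that the masked address and the element offsets are computed once per change instead of on every access, which appears to be the motivation behind the patch.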