Merge cd131d4953 into b383a382eb
commit 8178f238a8
3 changed files with 99 additions and 93 deletions
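The hunks below touch the ARM CryptoNight header (it ends with the __CRYPTONIGHT_ARM_H__ guard), the VARIANT1 tweak macros, and what appears to be the x86 counterpart of the same header. The recurring change is the same everywhere: addresses of the form output + i, input + i or &l0[idx0 & MASK], recomputed at every access, are replaced by a pointer (outputTmp, inputTmp, mp) that is computed once per step and then reused or advanced.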
@@ -260,7 +260,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -272,14 +274,15 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);

-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }

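Both explode hunks above apply the same micro-optimization: the counted loop that indexes output + i + n becomes a pointer walked up to a precomputed limit. A minimal standalone sketch of that transformation, using placeholder names (buf, len) rather than identifiers from this diff:

    #include <cstddef>

    // Index form: the address buf + i + n is re-derived from i on every store.
    void fill_indexed(float *buf, size_t len)
    {
        for (size_t i = 0; i < len; i += 4) {
            buf[i + 0] = 0.0f;
            buf[i + 1] = 0.0f;
            buf[i + 2] = 0.0f;
            buf[i + 3] = 0.0f;
        }
    }

    // Pointer form, as in the rewritten loops: the bound is computed once and the
    // pointer itself is advanced, so no per-iteration index arithmetic remains.
    void fill_pointer(float *buf, size_t len)
    {
        const float *limit = buf + len;
        for (float *p = buf; p < limit; p += 4) {
            p[0] = 0.0f;
            p[1] = 0.0f;
            p[2] = 0.0f;
            p[3] = 0.0f;
        }
    }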
@@ -301,16 +304,18 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8)
     {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

         aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -329,15 +334,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }

     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

             aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
             aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -387,9 +392,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
     uint64_t vh = vgetq_lane_u64(tmp, 1);

     uint8_t x = vh >> 24;
-    static const uint16_t table = 0x7531;
-    const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1;
-    vh ^= ((table >> index) & 0x3) << 28;
+    vh ^= ((0x7531 >> ((((x >> 3) & 6) | (x & 1)) << 1)) & 0x3) << 28;

     mem_out[1] = vh;
 }
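The one-liner folds the previous table and index locals into a single expression: 0x7531 is read as eight 2-bit entries selected by bits 0, 4 and 5 of the byte taken from vh >> 24, and the selected entry is XORed into bits 28-29 of the high word. A standalone restatement of that expression (the name tweak_high_word is illustrative, not from the source):

    #include <cstdint>

    // Same selection logic as the rewritten line above, written out step by step.
    static inline uint64_t tweak_high_word(uint64_t vh)
    {
        const uint8_t  x     = static_cast<uint8_t>(vh >> 24);
        const unsigned index = (((x >> 3) & 6) | (x & 1)) << 1;  // even shift: picks a 2-bit slot
        return vh ^ (static_cast<uint64_t>((0x7531 >> index) & 0x3) << 28);
    }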
@@ -421,51 +424,51 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);

     uint64_t idx0 = h0[0] ^ h0[4];
+    void* mp = ((uint8_t*) l0) + (idx0 & MASK);

     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;

         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
         }
         else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = _mm_load_si128((__m128i *) mp);
             cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }

         if (VARIANT > 0) {
-            cryptonight_monero_tweak((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            cryptonight_monero_tweak((uint64_t*) mp, _mm_xor_si128(bx0, cx));
         } else {
-            _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
         }

-        idx0 = EXTRACT64(cx);
+        mp = ((uint8_t*) l0) + ((idx0 = EXTRACT64(cx)) & MASK);
         bx0 = cx;

         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);

         al0 += hi;
         ah0 += lo;

         VARIANT1_2(ah0, 0);
-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
-        ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+        ((uint64_t*) mp)[0] = al0;
+        ((uint64_t*) mp)[1] = ah0;
         VARIANT1_2(ah0, 0);

         ah0 ^= ch;
         al0 ^= cl;
-        idx0 = al0;
+        mp = ((uint8_t*) l0) + (al0 & MASK);

         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }

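The single-hash rewrite above replaces every &l0[idx0 & MASK] with a cached pointer mp that is refreshed only where the index changes (after EXTRACT64(cx) and again after the multiply/add step). A minimal sketch of that caching idea, under assumed placeholder names (pad, mask):

    #include <cstdint>

    // Recomputing form: the masked address is derived again at every access.
    static uint64_t read_pair_indexed(const uint8_t *pad, uint64_t idx, uint64_t mask)
    {
        const uint64_t a = reinterpret_cast<const uint64_t*>(&pad[idx & mask])[0];
        const uint64_t b = reinterpret_cast<const uint64_t*>(&pad[idx & mask])[1];
        return a ^ b;
    }

    // Cached form, as the rewritten loop does with mp: resolve the address once,
    // then reuse it for every load and store until idx changes.
    static uint64_t read_pair_cached(const uint8_t *pad, uint64_t idx, uint64_t mask)
    {
        const uint64_t *mp = reinterpret_cast<const uint64_t*>(pad + (idx & mask));
        return mp[0] ^ mp[1];
    }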
@@ -622,3 +625,4 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
 }

 #endif /* __CRYPTONIGHT_ARM_H__ */
+
@@ -46,10 +46,9 @@

 #define VARIANT1_1(p) \
     if (VARIANT > 0) { \
-        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
         static const uint32_t table = 0x75310; \
-        const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \
-        ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \
+        const uint8_t tmp = reinterpret_cast<const uint8_t*>(p)[11]; \
+        ((uint8_t*)(p))[11] = tmp ^ ((table >> ((((tmp >> 3) & 6) | (tmp & 1)) << 1)) & 0x30); \
     }

 #define VARIANT1_2(p, part) \
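The macro still reads byte 11 of the block and XORs it with a value picked from the 0x75310 constant; only the separate index local is gone, the shift amount now being computed inline. The same logic written as a plain function (variant1_1_tweak is an illustrative name, not part of the source):

    #include <cstdint>

    // Expanded form of the VARIANT1_1 body: bits 0, 4 and 5 of byte 11 choose an
    // even shift into 0x75310, and the result is masked to bits 4-5 (0x30) before the XOR.
    static inline void variant1_1_tweak(uint8_t *p)
    {
        static const uint32_t table = 0x75310;
        const uint8_t  tmp   = p[11];
        const unsigned shift = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
        p[11] = tmp ^ static_cast<uint8_t>((table >> shift) & 0x30);
    }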
@@ -6,6 +6,7 @@
  * Copyright 2016 Jay D Dee <jayddee246@gmail.com>
  * Copyright 2017-2018 XMR-Stak <https://github.com/fireice-uk>, <https://github.com/psychocrypt>
  * Copyright 2018 Lee Clagett <https://github.com/vtnerd>
+ * Copyright 2018 aegroto <https://github.com/aegroto>
  * Copyright 2016-2018 XMRig <https://github.com/xmrig>, <support@xmrig.com>
  *
  * This program is free software: you can redistribute it and/or modify
@@ -122,7 +123,6 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
 }
 #endif

-
 // This will shift and xor tmp1 into itself as 4 32-bit vals such as
 // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1)
 static inline __m128i sl_xor(__m128i tmp1)
@@ -266,7 +266,9 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         }
     }

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
+    const __m128i *outputTmpLimit = output + (MEM / sizeof(__m128i));
+
+    for (__m128i *outputTmp = output; outputTmp < outputTmpLimit; outputTmp += 8) {
         aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
@@ -278,14 +280,14 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
         aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);

-        _mm_store_si128(output + i + 0, xin0);
-        _mm_store_si128(output + i + 1, xin1);
-        _mm_store_si128(output + i + 2, xin2);
-        _mm_store_si128(output + i + 3, xin3);
-        _mm_store_si128(output + i + 4, xin4);
-        _mm_store_si128(output + i + 5, xin5);
-        _mm_store_si128(output + i + 6, xin6);
-        _mm_store_si128(output + i + 7, xin7);
+        _mm_store_si128(outputTmp, xin0);
+        _mm_store_si128(outputTmp + 1, xin1);
+        _mm_store_si128(outputTmp + 2, xin2);
+        _mm_store_si128(outputTmp + 3, xin3);
+        _mm_store_si128(outputTmp + 4, xin4);
+        _mm_store_si128(outputTmp + 5, xin5);
+        _mm_store_si128(outputTmp + 6, xin6);
+        _mm_store_si128(outputTmp + 7, xin7);
     }
 }

@@ -307,16 +309,17 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);

-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
-    {
-        xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-        xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-        xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-        xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-        xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-        xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-        xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-        xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+    const __m128i *inputTmpLimit = (__m128i*) input + MEM / sizeof(__m128i);
+
+    for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+        xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+        xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+        xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+        xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+        xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+        xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+        xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+        xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

         aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -335,15 +338,15 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
     }

     if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-        for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
-            xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
-            xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
-            xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
-            xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3);
-            xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4);
-            xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5);
-            xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6);
-            xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7);
+        for (__m128i *inputTmp = (__m128i*) input; inputTmp < inputTmpLimit; inputTmp += 8) {
+            xout0 = _mm_xor_si128(_mm_load_si128(inputTmp), xout0);
+            xout1 = _mm_xor_si128(_mm_load_si128(inputTmp + 1), xout1);
+            xout2 = _mm_xor_si128(_mm_load_si128(inputTmp + 2), xout2);
+            xout3 = _mm_xor_si128(_mm_load_si128(inputTmp + 3), xout3);
+            xout4 = _mm_xor_si128(_mm_load_si128(inputTmp + 4), xout4);
+            xout5 = _mm_xor_si128(_mm_load_si128(inputTmp + 5), xout5);
+            xout6 = _mm_xor_si128(_mm_load_si128(inputTmp + 6), xout6);
+            xout7 = _mm_xor_si128(_mm_load_si128(inputTmp + 7), xout7);

             aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
             aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
@@ -403,6 +406,7 @@ static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp)
 }


+
 template<xmrig::Algo ALGO, bool SOFT_AES, int VARIANT>
 inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
 {
@@ -429,60 +433,60 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);

     uint64_t idx0 = al0;
+    void* mp = ((uint8_t*) l0) + (idx0 & MASK);

     for (size_t i = 0; i < ITERATIONS; i++) {
         __m128i cx;

         if (SOFT_AES) {
-            cx = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+            cx = soft_aesenc((uint32_t*) mp, _mm_set_epi64x(ah0, al0));
         }
         else {
-            cx = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
+            cx = _mm_load_si128((__m128i *) mp);
             cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0));
         }

         if (VARIANT > 0) {
-            cryptonight_monero_tweak<VARIANT == xmrig::VARIANT_XTL ? 4 : 3>((uint64_t*)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            cryptonight_monero_tweak<VARIANT == xmrig::VARIANT_XTL ? 4 : 3>((uint64_t*) mp, _mm_xor_si128(bx0, cx));
         } else {
-            _mm_store_si128((__m128i *)&l0[idx0 & MASK], _mm_xor_si128(bx0, cx));
+            _mm_store_si128((__m128i *) mp, _mm_xor_si128(bx0, cx));
         }

-        idx0 = EXTRACT64(cx);
+        mp = (void*) &l0[(idx0 = EXTRACT64(cx)) & MASK];
         bx0 = cx;

         uint64_t hi, lo, cl, ch;
-        cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-        ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+        cl = ((uint64_t*) mp)[0];
+        ch = ((uint64_t*) mp)[1];
         lo = __umul128(idx0, cl, &hi);

         al0 += hi;
         ah0 += lo;

-        ((uint64_t*)&l0[idx0 & MASK])[0] = al0;
+        ((uint64_t*) mp)[0] = al0;

         if (VARIANT > 0) {
             if (VARIANT == xmrig::VARIANT_IPBC) {
-                ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0 ^ al0;
+                ((uint64_t*) mp)[1] = ah0 ^ tweak1_2_0 ^ al0;
             }
             else {
-                ((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
+                ((uint64_t*) mp)[1] = ah0 ^ tweak1_2_0;
             }
         }
         else {
-            ((uint64_t*)&l0[idx0 & MASK])[1] = ah0;
+            ((uint64_t*) mp)[1] = ah0;
         }

         al0 ^= cl;
         ah0 ^= ch;
-        idx0 = al0;
+        mp = (void*) &l0[al0 & MASK];

         if (ALGO == xmrig::CRYPTONIGHT_HEAVY) {
-            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t n = ((int64_t*) mp)[0];
+            int32_t d = ((int32_t*) mp)[2];
             int64_t q = n / (d | 0x5);

-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) mp)[0] = n ^ q;
         }
     }

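In the x86 loop the index update is folded into the pointer refresh, mp = (void*) &l0[(idx0 = EXTRACT64(cx)) & MASK];, so idx0 still carries the value needed by the __umul128 that follows. A small sketch of that assign-and-use form with placeholder names:

    #include <cstdint>

    // Placeholder sketch: update the running index and the cached pointer in one statement.
    static inline void *advance(uint8_t *pad, uint64_t &idx, uint64_t next, uint64_t mask)
    {
        return &pad[(idx = next) & mask];   // idx is assigned first, then masked for the address
    }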
@@ -492,7 +496,6 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
     extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
 }

-
 template<xmrig::Algo ALGO, bool SOFT_AES, int VARIANT>
 inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx)
 {