Merge xmrig v6.16.0 into master
This commit is contained in:
commit
9afe95e454
84 changed files with 72411 additions and 233 deletions
|
@ -285,23 +285,41 @@ inline constexpr uint64_t interleaved_index<0>(uint64_t k)
|
|||
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
|
||||
static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
|
||||
static NOINLINE void cn_explode_scratchpad(cryptonight_ctx *ctx)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
|
||||
|
||||
__m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7;
|
||||
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
||||
const __m128i* input = reinterpret_cast<const __m128i*>(ctx->state);
|
||||
__m128i* output = reinterpret_cast<__m128i*>(ctx->memory);
|
||||
|
||||
aes_genkey<SOFT_AES>(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
xin0 = _mm_load_si128(input + 4);
|
||||
xin1 = _mm_load_si128(input + 5);
|
||||
xin2 = _mm_load_si128(input + 6);
|
||||
xin3 = _mm_load_si128(input + 7);
|
||||
xin4 = _mm_load_si128(input + 8);
|
||||
xin5 = _mm_load_si128(input + 9);
|
||||
xin6 = _mm_load_si128(input + 10);
|
||||
xin7 = _mm_load_si128(input + 11);
|
||||
if (props.half_mem() && !ctx->first_half) {
|
||||
const __m128i* p = reinterpret_cast<const __m128i*>(ctx->save_state);
|
||||
xin0 = _mm_load_si128(p + 0);
|
||||
xin1 = _mm_load_si128(p + 1);
|
||||
xin2 = _mm_load_si128(p + 2);
|
||||
xin3 = _mm_load_si128(p + 3);
|
||||
xin4 = _mm_load_si128(p + 4);
|
||||
xin5 = _mm_load_si128(p + 5);
|
||||
xin6 = _mm_load_si128(p + 6);
|
||||
xin7 = _mm_load_si128(p + 7);
|
||||
}
|
||||
else {
|
||||
xin0 = _mm_load_si128(input + 4);
|
||||
xin1 = _mm_load_si128(input + 5);
|
||||
xin2 = _mm_load_si128(input + 6);
|
||||
xin3 = _mm_load_si128(input + 7);
|
||||
xin4 = _mm_load_si128(input + 8);
|
||||
xin5 = _mm_load_si128(input + 9);
|
||||
xin6 = _mm_load_si128(input + 10);
|
||||
xin7 = _mm_load_si128(input + 11);
|
||||
}
|
||||
|
||||
if (props.isHeavy()) {
|
||||
for (size_t i = 0; i < 16; i++) {
|
||||
|
@ -320,42 +338,61 @@ static inline void cn_explode_scratchpad(const __m128i *input, __m128i *output)
|
|||
}
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < props.memory() / sizeof(__m128i); i += 8) {
|
||||
if (interleave > 0) {
|
||||
_mm_prefetch((const char*)(output), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(output + (64 << interleave) / sizeof(__m128i)), _MM_HINT_T0);
|
||||
}
|
||||
constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
|
||||
constexpr int prefetch_dist = 2048 / sizeof(__m128i);
|
||||
|
||||
aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
__m128i* e = output + N - prefetch_dist;
|
||||
__m128i* prefetch_ptr = output + prefetch_dist;
|
||||
|
||||
_mm_store_si128(output + 0, xin0);
|
||||
_mm_store_si128(output + 1, xin1);
|
||||
_mm_store_si128(output + 2, xin2);
|
||||
_mm_store_si128(output + 3, xin3);
|
||||
for (int i = 0; i < 2; ++i) {
|
||||
do {
|
||||
_mm_prefetch((const char*)(prefetch_ptr), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(prefetch_ptr + output_increment), _MM_HINT_T0);
|
||||
|
||||
constexpr int output_increment = (64 << interleave) / sizeof(__m128i);
|
||||
aes_round<SOFT_AES>(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
|
||||
|
||||
_mm_store_si128(output + output_increment + 0, xin4);
|
||||
_mm_store_si128(output + output_increment + 1, xin5);
|
||||
_mm_store_si128(output + output_increment + 2, xin6);
|
||||
_mm_store_si128(output + output_increment + 3, xin7);
|
||||
_mm_store_si128(output + 0, xin0);
|
||||
_mm_store_si128(output + 1, xin1);
|
||||
_mm_store_si128(output + 2, xin2);
|
||||
_mm_store_si128(output + 3, xin3);
|
||||
|
||||
output += output_increment * 2;
|
||||
_mm_store_si128(output + output_increment + 0, xin4);
|
||||
_mm_store_si128(output + output_increment + 1, xin5);
|
||||
_mm_store_si128(output + output_increment + 2, xin6);
|
||||
_mm_store_si128(output + output_increment + 3, xin7);
|
||||
|
||||
output += output_increment * 2;
|
||||
prefetch_ptr += output_increment * 2;
|
||||
} while (output < e);
|
||||
e += prefetch_dist;
|
||||
prefetch_ptr = output;
|
||||
}
|
||||
|
||||
if (props.half_mem() && ctx->first_half) {
|
||||
__m128i* p = reinterpret_cast<__m128i*>(ctx->save_state);
|
||||
_mm_store_si128(p + 0, xin0);
|
||||
_mm_store_si128(p + 1, xin1);
|
||||
_mm_store_si128(p + 2, xin2);
|
||||
_mm_store_si128(p + 3, xin3);
|
||||
_mm_store_si128(p + 4, xin4);
|
||||
_mm_store_si128(p + 5, xin5);
|
||||
_mm_store_si128(p + 6, xin6);
|
||||
_mm_store_si128(p + 7, xin7);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES, int interleave>
|
||||
static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
||||
static NOINLINE void cn_implode_scratchpad(cryptonight_ctx *ctx)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
|
||||
|
@ -365,9 +402,14 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
|||
constexpr bool IS_HEAVY = props.isHeavy();
|
||||
# endif
|
||||
|
||||
constexpr size_t N = (props.memory() / sizeof(__m128i)) / (props.half_mem() ? 2 : 1);
|
||||
|
||||
__m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7;
|
||||
__m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9;
|
||||
|
||||
const __m128i *input = reinterpret_cast<const __m128i*>(ctx->memory);
|
||||
__m128i *output = reinterpret_cast<__m128i*>(ctx->state);
|
||||
|
||||
aes_genkey<SOFT_AES>(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9);
|
||||
|
||||
xout0 = _mm_load_si128(output + 4);
|
||||
|
@ -380,46 +422,54 @@ static inline void cn_implode_scratchpad(const __m128i *input, __m128i *output)
|
|||
xout7 = _mm_load_si128(output + 11);
|
||||
|
||||
const __m128i* input_begin = input;
|
||||
for (size_t i = 0; i < props.memory() / sizeof(__m128i);) {
|
||||
xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
|
||||
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
|
||||
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
|
||||
|
||||
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
|
||||
|
||||
input += input_increment * 2;
|
||||
i += 8;
|
||||
|
||||
if ((interleave > 0) && (i < props.memory() / sizeof(__m128i))) {
|
||||
_mm_prefetch((const char*)(input), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(input + (64 << interleave) / sizeof(__m128i)), _MM_HINT_T0);
|
||||
for (size_t part = 0; part < (props.half_mem() ? 2 : 1); ++part) {
|
||||
if (props.half_mem() && (part == 1)) {
|
||||
input = input_begin;
|
||||
ctx->first_half = false;
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(ctx);
|
||||
}
|
||||
|
||||
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
for (size_t i = 0; i < N;) {
|
||||
xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
|
||||
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
|
||||
xout3 = _mm_xor_si128(_mm_load_si128(input + 3), xout3);
|
||||
|
||||
if (IS_HEAVY) {
|
||||
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
|
||||
constexpr int input_increment = (64 << interleave) / sizeof(__m128i);
|
||||
|
||||
xout4 = _mm_xor_si128(_mm_load_si128(input + input_increment + 0), xout4);
|
||||
xout5 = _mm_xor_si128(_mm_load_si128(input + input_increment + 1), xout5);
|
||||
xout6 = _mm_xor_si128(_mm_load_si128(input + input_increment + 2), xout6);
|
||||
xout7 = _mm_xor_si128(_mm_load_si128(input + input_increment + 3), xout7);
|
||||
|
||||
input += input_increment * 2;
|
||||
i += 8;
|
||||
|
||||
if (i < N) {
|
||||
_mm_prefetch((const char*)(input), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(input + input_increment), _MM_HINT_T0);
|
||||
}
|
||||
|
||||
aes_round<SOFT_AES>(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
|
||||
|
||||
if (IS_HEAVY) {
|
||||
mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (IS_HEAVY) {
|
||||
input = input_begin;
|
||||
for (size_t i = 0; i < props.memory() / sizeof(__m128i);) {
|
||||
for (size_t i = 0; i < N;) {
|
||||
xout0 = _mm_xor_si128(_mm_load_si128(input + 0), xout0);
|
||||
xout1 = _mm_xor_si128(_mm_load_si128(input + 1), xout1);
|
||||
xout2 = _mm_xor_si128(_mm_load_si128(input + 2), xout2);
|
||||
|
@ -529,6 +579,9 @@ static inline __m128i int_sqrt_v2(const uint64_t n0)
|
|||
void v4_soft_aes_compile_code(const V4_Instruction *code, int code_size, void *machine_code, xmrig::Assembly ASM);
|
||||
|
||||
|
||||
alignas(64) static const uint32_t tweak1_table[256] = { 268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,268435456,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,805306368,0,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456,805306368,268435456 };
|
||||
|
||||
|
||||
namespace xmrig {
|
||||
|
||||
|
||||
|
@ -547,12 +600,7 @@ static inline void cryptonight_monero_tweak(uint64_t *mem_out, const uint8_t *l,
|
|||
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
||||
uint64_t vh = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
uint8_t x = static_cast<uint8_t>(vh >> 24);
|
||||
static const uint16_t table = 0x7531;
|
||||
const uint8_t index = (((x >> (3)) & 6) | (x & 1)) << 1;
|
||||
vh ^= ((table >> index) & 0x3) << 28;
|
||||
|
||||
mem_out[1] = vh;
|
||||
mem_out[1] = vh ^ tweak1_table[static_cast<uint32_t>(vh) >> 24];
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -593,7 +641,11 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
keccak(input, size, ctx[0]->state);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(reinterpret_cast<const __m128i *>(ctx[0]->state), reinterpret_cast<__m128i *>(ctx[0]->memory));
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, interleave>(ctx[0]);
|
||||
|
||||
uint64_t *h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
|
||||
uint8_t *l0 = ctx[0]->memory;
|
||||
|
@ -748,7 +800,7 @@ inline void cryptonight_single_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
# endif
|
||||
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, interleave>(reinterpret_cast<const __m128i *>(ctx[0]->memory), reinterpret_cast<__m128i *>(ctx[0]->state));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, interleave>(ctx[0]);
|
||||
keccakf(h0, 24);
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
}
|
||||
|
@ -906,7 +958,11 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
}
|
||||
|
||||
keccak(input, size, ctx[0]->state);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
|
||||
if (ALGO == Algorithm::CN_2) {
|
||||
if (ASM == Assembly::INTEL) {
|
||||
|
@ -988,7 +1044,7 @@ inline void cryptonight_single_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
ctx[0]->generated_code(ctx);
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
}
|
||||
|
@ -1010,8 +1066,12 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
keccak(input, size, ctx[0]->state);
|
||||
keccak(input + size, size, ctx[1]->state);
|
||||
|
||||
cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->state), reinterpret_cast<__m128i*>(ctx[0]->memory));
|
||||
cn_explode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[1]->state), reinterpret_cast<__m128i*>(ctx[1]->memory));
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
ctx[1]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
|
||||
if (ALGO == Algorithm::CN_2) {
|
||||
cnv2_double_mainloop_sandybridge_asm(ctx);
|
||||
|
@ -1050,8 +1110,8 @@ inline void cryptonight_double_hash_asm(const uint8_t *__restrict__ input, size_
|
|||
ctx[0]->generated_code(ctx);
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[0]->memory), reinterpret_cast<__m128i*>(ctx[0]->state));
|
||||
cn_implode_scratchpad<ALGO, false, 0>(reinterpret_cast<const __m128i*>(ctx[1]->memory), reinterpret_cast<__m128i*>(ctx[1]->state));
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, false, 0>(ctx[1]);
|
||||
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[0]->state), 24);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[1]->state), 24);
|
||||
|
@ -1102,8 +1162,12 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
VARIANT4_RANDOM_MATH_INIT(0);
|
||||
VARIANT4_RANDOM_MATH_INIT(1);
|
||||
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(h0), reinterpret_cast<__m128i *>(l0));
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(h1), reinterpret_cast<__m128i *>(l1));
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
ctx[1]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
|
||||
uint64_t al0 = h0[0] ^ h0[4];
|
||||
uint64_t al1 = h1[0] ^ h1[4];
|
||||
|
@ -1298,8 +1362,8 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
bx10 = cx1;
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(l0), reinterpret_cast<__m128i *>(h0));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i *>(l1), reinterpret_cast<__m128i *>(h1));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
|
||||
keccakf(h0, 24);
|
||||
keccakf(h1, 24);
|
||||
|
@ -1309,6 +1373,198 @@ inline void cryptonight_double_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
|
||||
static inline void cryptonight_monero_tweak_gr(uint64_t* mem_out, const uint8_t* l, uint64_t idx, __m128i ax0, __m128i bx0, __m128i cx)
|
||||
{
|
||||
__m128i tmp = _mm_xor_si128(bx0, cx);
|
||||
mem_out[0] = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp)));
|
||||
uint64_t vh = _mm_cvtsi128_si64(tmp);
|
||||
|
||||
mem_out[1] = vh ^ tweak1_table[static_cast<uint32_t>(vh) >> 24];
|
||||
}
|
||||
|
||||
|
||||
template<Algorithm::Id ALGO, bool SOFT_AES>
|
||||
void cryptonight_quad_hash_zen(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, cryptonight_ctx** __restrict__ ctx, uint64_t height)
|
||||
{
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr size_t MASK = props.mask();
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
||||
if (BASE == Algorithm::CN_1 && size < 43) {
|
||||
memset(output, 0, 64);
|
||||
return;
|
||||
}
|
||||
|
||||
keccak(input + size * 0, size, ctx[0]->state);
|
||||
keccak(input + size * 1, size, ctx[1]->state);
|
||||
keccak(input + size * 2, size, ctx[2]->state);
|
||||
keccak(input + size * 3, size, ctx[3]->state);
|
||||
|
||||
uint8_t* l0 = ctx[0]->memory;
|
||||
uint8_t* l1 = ctx[1]->memory;
|
||||
uint8_t* l2 = ctx[2]->memory;
|
||||
uint8_t* l3 = ctx[3]->memory;
|
||||
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx[0]->state);
|
||||
uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx[1]->state);
|
||||
uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx[2]->state);
|
||||
uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx[3]->state);
|
||||
|
||||
VARIANT1_INIT(0);
|
||||
VARIANT1_INIT(1);
|
||||
VARIANT1_INIT(2);
|
||||
VARIANT1_INIT(3);
|
||||
|
||||
if (props.half_mem()) {
|
||||
ctx[0]->first_half = true;
|
||||
ctx[1]->first_half = true;
|
||||
ctx[2]->first_half = true;
|
||||
ctx[3]->first_half = true;
|
||||
}
|
||||
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
|
||||
|
||||
uint64_t al0 = h0[0] ^ h0[4];
|
||||
uint64_t al1 = h1[0] ^ h1[4];
|
||||
uint64_t al2 = h2[0] ^ h2[4];
|
||||
uint64_t al3 = h3[0] ^ h3[4];
|
||||
|
||||
uint64_t ah0 = h0[1] ^ h0[5];
|
||||
uint64_t ah1 = h1[1] ^ h1[5];
|
||||
uint64_t ah2 = h2[1] ^ h2[5];
|
||||
uint64_t ah3 = h3[1] ^ h3[5];
|
||||
|
||||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
|
||||
__m128i bx20 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
|
||||
__m128i bx30 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
|
||||
|
||||
uint64_t idx0 = al0;
|
||||
uint64_t idx1 = al1;
|
||||
uint64_t idx2 = al2;
|
||||
uint64_t idx3 = al3;
|
||||
|
||||
__m128i cx0, cx1, cx2, cx3;
|
||||
|
||||
if (!SOFT_AES) {
|
||||
cx0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[idx0 & MASK]));
|
||||
cx1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l1[idx1 & MASK]));
|
||||
cx2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l2[idx2 & MASK]));
|
||||
cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < props.iterations(); i++) {
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
|
||||
const __m128i ax2 = _mm_set_epi64x(ah2, al2);
|
||||
const __m128i ax3 = _mm_set_epi64x(ah3, al3);
|
||||
|
||||
if (SOFT_AES) {
|
||||
cx0 = soft_aesenc(&l0[idx0 & MASK], ax0, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx1 = soft_aesenc(&l1[idx1 & MASK], ax1, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx2 = soft_aesenc(&l2[idx2 & MASK], ax2, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
cx3 = soft_aesenc(&l3[idx3 & MASK], ax3, reinterpret_cast<const uint32_t*>(saes_table));
|
||||
}
|
||||
else {
|
||||
cx0 = _mm_aesenc_si128(cx0, ax0);
|
||||
cx1 = _mm_aesenc_si128(cx1, ax1);
|
||||
cx2 = _mm_aesenc_si128(cx2, ax2);
|
||||
cx3 = _mm_aesenc_si128(cx3, ax3);
|
||||
if (MASK > 131072) {
|
||||
_mm_prefetch((const char*)(&l0[_mm_cvtsi128_si32(cx0) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l1[_mm_cvtsi128_si32(cx1) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l2[_mm_cvtsi128_si32(cx2) & MASK]), _MM_HINT_T0);
|
||||
_mm_prefetch((const char*)(&l3[_mm_cvtsi128_si32(cx3) & MASK]), _MM_HINT_T0);
|
||||
}
|
||||
}
|
||||
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l0[idx0 & MASK], l0, idx0 & MASK, ax0, bx00, cx0);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l1[idx1 & MASK], l1, idx1 & MASK, ax1, bx10, cx1);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l2[idx2 & MASK], l2, idx2 & MASK, ax2, bx20, cx2);
|
||||
cryptonight_monero_tweak_gr((uint64_t*)&l3[idx3 & MASK], l3, idx3 & MASK, ax3, bx30, cx3);
|
||||
|
||||
idx0 = _mm_cvtsi128_si64(cx0);
|
||||
idx1 = _mm_cvtsi128_si64(cx1);
|
||||
idx2 = _mm_cvtsi128_si64(cx2);
|
||||
idx3 = _mm_cvtsi128_si64(cx3);
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
|
||||
cl = ((uint64_t*)&l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*)&l0[idx0 & MASK])[1];
|
||||
lo = __umul128(idx0, cl, &hi);
|
||||
al0 += hi;
|
||||
ah0 += lo;
|
||||
((uint64_t*)&l0[idx0 & MASK])[0] = al0;
|
||||
((uint64_t*)&l0[idx0 & MASK])[1] = ah0 ^ tweak1_2_0;
|
||||
al0 ^= cl;
|
||||
ah0 ^= ch;
|
||||
idx0 = al0;
|
||||
bx00 = cx0;
|
||||
if (!SOFT_AES) cx0 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l0[idx0 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l1[idx1 & MASK])[0];
|
||||
ch = ((uint64_t*)&l1[idx1 & MASK])[1];
|
||||
lo = __umul128(idx1, cl, &hi);
|
||||
al1 += hi;
|
||||
ah1 += lo;
|
||||
((uint64_t*)&l1[idx1 & MASK])[0] = al1;
|
||||
((uint64_t*)&l1[idx1 & MASK])[1] = ah1 ^ tweak1_2_1;
|
||||
al1 ^= cl;
|
||||
ah1 ^= ch;
|
||||
idx1 = al1;
|
||||
bx10 = cx1;
|
||||
if (!SOFT_AES) cx1 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l1[idx1 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l2[idx2 & MASK])[0];
|
||||
ch = ((uint64_t*)&l2[idx2 & MASK])[1];
|
||||
lo = __umul128(idx2, cl, &hi);
|
||||
al2 += hi;
|
||||
ah2 += lo;
|
||||
((uint64_t*)&l2[idx2 & MASK])[0] = al2;
|
||||
((uint64_t*)&l2[idx2 & MASK])[1] = ah2 ^ tweak1_2_2;
|
||||
al2 ^= cl;
|
||||
ah2 ^= ch;
|
||||
idx2 = al2;
|
||||
bx20 = cx2;
|
||||
if (!SOFT_AES) cx2 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l2[idx2 & MASK]));
|
||||
|
||||
cl = ((uint64_t*)&l3[idx3 & MASK])[0];
|
||||
ch = ((uint64_t*)&l3[idx3 & MASK])[1];
|
||||
lo = __umul128(idx3, cl, &hi);
|
||||
al3 += hi;
|
||||
ah3 += lo;
|
||||
((uint64_t*)&l3[idx3 & MASK])[0] = al3;
|
||||
((uint64_t*)&l3[idx3 & MASK])[1] = ah3 ^ tweak1_2_3;
|
||||
al3 ^= cl;
|
||||
ah3 ^= ch;
|
||||
idx3 = al3;
|
||||
bx30 = cx3;
|
||||
if (!SOFT_AES) cx3 = _mm_load_si128(reinterpret_cast<const __m128i*>(&l3[idx3 & MASK]));
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[0]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[1]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[2]);
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[3]);
|
||||
|
||||
keccakf(h0, 24);
|
||||
keccakf(h1, 24);
|
||||
keccakf(h2, 24);
|
||||
keccakf(h3, 24);
|
||||
|
||||
extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output);
|
||||
extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32);
|
||||
extra_hashes[ctx[2]->state[0] & 3](ctx[2]->state, 200, output + 64);
|
||||
extra_hashes[ctx[3]->state[0] & 3](ctx[3]->state, 200, output + 96);
|
||||
}
|
||||
|
||||
|
||||
#define CN_STEP1(a, b0, b1, c, l, ptr, idx, conc_var) \
|
||||
ptr = reinterpret_cast<__m128i*>(&l[idx & MASK]); \
|
||||
c = _mm_load_si128(ptr); \
|
||||
|
@ -1444,7 +1700,10 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
|
|||
|
||||
for (size_t i = 0; i < 3; i++) {
|
||||
keccak(input + size * i, size, ctx[i]->state);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
|
||||
if (props.half_mem()) {
|
||||
ctx[i]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
}
|
||||
|
||||
uint8_t* l0 = ctx[0]->memory;
|
||||
|
@ -1489,7 +1748,7 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < 3; i++) {
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
|
||||
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
|
||||
}
|
||||
|
@ -1499,6 +1758,14 @@ inline void cryptonight_triple_hash(const uint8_t *__restrict__ input, size_t si
|
|||
template<Algorithm::Id ALGO, bool SOFT_AES>
|
||||
inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size, uint8_t *__restrict__ output, cryptonight_ctx **__restrict__ ctx, uint64_t height)
|
||||
{
|
||||
const auto arch = Cpu::info()->arch();
|
||||
if ((arch >= ICpuInfo::ARCH_ZEN) && (arch <= ICpuInfo::ARCH_ZEN3)) {
|
||||
if ((ALGO == Algorithm::CN_GR_0) || (ALGO == Algorithm::CN_GR_1) || (ALGO == Algorithm::CN_GR_2) || (ALGO == Algorithm::CN_GR_3) || (ALGO == Algorithm::CN_GR_4) || (ALGO == Algorithm::CN_GR_5)) {
|
||||
cryptonight_quad_hash_zen<ALGO, SOFT_AES>(input, size, output, ctx, height);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
constexpr CnAlgo<ALGO> props;
|
||||
constexpr size_t MASK = props.mask();
|
||||
constexpr Algorithm::Id BASE = props.base();
|
||||
|
@ -1518,7 +1785,10 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
|
|||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
keccak(input + size * i, size, ctx[i]->state);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
|
||||
if (props.half_mem()) {
|
||||
ctx[i]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
}
|
||||
|
||||
uint8_t* l0 = ctx[0]->memory;
|
||||
|
@ -1571,7 +1841,7 @@ inline void cryptonight_quad_hash(const uint8_t *__restrict__ input, size_t size
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < 4; i++) {
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
|
||||
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
|
||||
}
|
||||
|
@ -1600,7 +1870,10 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
|
|||
|
||||
for (size_t i = 0; i < 5; i++) {
|
||||
keccak(input + size * i, size, ctx[i]->state);
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->state), reinterpret_cast<__m128i*>(ctx[i]->memory));
|
||||
if (props.half_mem()) {
|
||||
ctx[i]->first_half = true;
|
||||
}
|
||||
cn_explode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
}
|
||||
|
||||
uint8_t* l0 = ctx[0]->memory;
|
||||
|
@ -1661,7 +1934,7 @@ inline void cryptonight_penta_hash(const uint8_t *__restrict__ input, size_t siz
|
|||
}
|
||||
|
||||
for (size_t i = 0; i < 5; i++) {
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(reinterpret_cast<const __m128i*>(ctx[i]->memory), reinterpret_cast<__m128i*>(ctx[i]->state));
|
||||
cn_implode_scratchpad<ALGO, SOFT_AES, 0>(ctx[i]);
|
||||
keccakf(reinterpret_cast<uint64_t*>(ctx[i]->state), 24);
|
||||
extra_hashes[ctx[i]->state[0] & 3](ctx[i]->state, 200, output + 32 * i);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue