AVX2 optimized code for AstroBWT

Added "astrobwt-avx2" parameter in config.json, it's turned off ("false") by default.

4-5% speedup on CPUs with proper AVX2 support (AMD Ryzen starting with Zen2, Intel Core starting with Haswell).

There will be no speedup on the following CPUs:

- Intel Pentium/Celeron don't support AVX2
- AMD Zen/Zen+ have only half-speed AVX

GCC compiled version is faster without AVX2, MSVC compiled version is faster with AVX2
This commit is contained in:
SChernykh 2020-03-10 22:03:16 +01:00
parent 8698b73036
commit e22f798085
14 changed files with 563 additions and 15 deletions

View file

@ -30,6 +30,10 @@
#include "AstroBWT.h"
#include "sha3.h"
#include "crypto/cn/CryptoNight.h"
#include "base/net/stratum/Job.h"
#include "base/crypto/Algorithm.h"
#include "base/io/log/Log.h"
#include "backend/cpu/Cpu.h"
#include <limits>
constexpr int STAGE1_SIZE = 147253;
@ -38,6 +42,18 @@ constexpr int ALLOCATION_SIZE = (STAGE1_SIZE + 1048576) + (128 - (STAGE1_SIZE &
constexpr int COUNTING_SORT_BITS = 10;
constexpr int COUNTING_SORT_SIZE = 1 << COUNTING_SORT_BITS;
static bool astrobwtInitialized = false;
#ifdef ASTROBWT_AVX2
static bool hasAVX2 = false;
extern "C"
#ifdef __GNUC__
__attribute__((ms_abi))
#endif
void SHA3_256_AVX2_ASM(const void* in, size_t inBytes, void* out);
#endif
#ifdef _MSC_VER
#include <stdlib.h>
@ -155,7 +171,25 @@ void sort_indices(int N, const uint8_t* v, uint64_t* indices, uint64_t* tmp_indi
}
}
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size)
bool xmrig::astrobwt::init(const xmrig::Job& job)
{
if (job.algorithm().family() != xmrig::Algorithm::ASTROBWT)
return true;
if (astrobwtInitialized)
return true;
#ifdef ASTROBWT_AVX2
if (xmrig::Cpu::info()->hasAVX2()) {
hasAVX2 = true;
}
#endif
astrobwtInitialized = true;
return true;
}
bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size, void* scratchpad, uint8_t* output_hash, int stage2_max_size, bool avx2)
{
uint8_t key[32];
uint8_t* scratchpad_ptr = (uint8_t*)(scratchpad) + 64;
@ -166,7 +200,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
uint8_t* stage1_result = (uint8_t*)(tmp_indices);
uint8_t* stage2_result = (uint8_t*)(tmp_indices);
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(input_data, input_size, key);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, input_data, input_size, key, sizeof(key));
Salsa20_XORKeyStream(key, stage1_output, STAGE1_SIZE);
@ -178,7 +217,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
stage1_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage1_result, STAGE1_SIZE + 1, key);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage1_result, STAGE1_SIZE + 1, key, sizeof(key));
const int stage2_size = STAGE1_SIZE + (*(uint32_t*)(key) & 0xfffff);
if (stage2_size > stage2_max_size)
@ -203,7 +247,12 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
stage2_result[i] = tmp[indices[i] & ((1 << 21) - 1)];
}
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
#ifdef ASTROBWT_AVX2
if (hasAVX2 && avx2)
SHA3_256_AVX2_ASM(stage2_result, stage2_size + 1, output_hash);
else
#endif
sha3_HashBuffer(256, SHA3_FLAGS_NONE, stage2_result, stage2_size + 1, output_hash, 32);
return true;
}
@ -211,5 +260,5 @@ bool xmrig::astrobwt::astrobwt_dero(const void* input_data, uint32_t input_size,
template<>
void xmrig::astrobwt::single_hash<xmrig::Algorithm::ASTROBWT_DERO>(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx** ctx, uint64_t)
{
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max());
astrobwt_dero(input, static_cast<uint32_t>(size), ctx[0]->memory, output, std::numeric_limits<int>::max(), true);
}