From 1ce9d2bf3c93a5dcf33238c69968311d8ff67287 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ben=20Gr=C3=A4f?=
Date: Sat, 7 Apr 2018 21:20:24 +0200
Subject: [PATCH] Implemented CN-Heavy algo, algo self tests and fixed n-loop
 variants (#86)

* Debounced the connection retry when connecting to the donation server fails
* When a PoW variant is set on the proxy, it overrides the locally set PoW
* Implemented the cn-heavy algo
* Added a self test for cn-heavy
* Fixed the n-loop variant of powV2 and cn-heavy
* Fixed the n-loop variant of powV2 and added cn-heavy for ARM
* Fixed the n-loop for ARM
* Limited cn-heavy to a maximum hashfactor of 3, since higher values fail the self test
* Removed a lot of casts
* Fixed the algo self test
---
 CMakeLists.txt                        |    7 +-
 src/3rdparty/clib-net/src/net.c       |   11 +-
 src/Cpu.cpp                           |    7 +-
 src/Mem.cpp                           |   16 +-
 src/Mem_unix.cpp                      |   16 +-
 src/Mem_win.cpp                       |   16 +-
 src/Options.cpp                       |   30 +-
 src/Options.h                         |    1 +
 src/crypto/CryptoNight.cpp            |  145 ++-
 src/crypto/CryptoNight.h              |    1 +
 src/crypto/CryptoNight_arm.h          | 1475 +++++++++++++++++++++----
 src/crypto/CryptoNight_test.h         |   22 +-
 src/crypto/CryptoNight_x86.h          | 1307 +++++++++++++++++++---
 src/net/Client.cpp                    |   31 +-
 src/net/strategies/DonateStrategy.cpp |   40 +-
 src/net/strategies/DonateStrategy.h   |    1 +
 src/version.h                         |    6 +-
 17 files changed, 2682 insertions(+), 450 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3ea1e92b..72aa185c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -6,7 +6,6 @@ if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 endif(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
 
 option(WITH_LIBCPUID "Use Libcpuid" ON)
-option(WITH_AEON "CryptoNight-Lite support" ON)
 option(WITH_HTTPD "HTTP REST API" OFF)
 option(WITH_CC_CLIENT "CC Client" ON)
 option(WITH_CC_SERVER "CC Server" ON)
@@ -91,7 +90,7 @@ else()
     )
 
     set(EXTRA_LIBS pthread rt)
-
+    
     if (CMAKE_SYSTEM_NAME STREQUAL FreeBSD)
         set(EXTRA_LIBS ${EXTRA_LIBS} kvm)
     else()
@@ -154,10 +153,6 @@ if (HAVE_SYSLOG_H)
     set(SOURCES_SYSLOG src/log/SysLog.h src/log/SysLog.cpp)
 endif()
 
-if (NOT WITH_AEON)
-    add_definitions(/DXMRIG_NO_AEON)
-endif()
-
 if (WITH_HTTPD)
     find_package(MHD)
 
diff --git a/src/3rdparty/clib-net/src/net.c b/src/3rdparty/clib-net/src/net.c
index 8eafc5c1..b1780cb6 100644
--- a/src/3rdparty/clib-net/src/net.c
+++ b/src/3rdparty/clib-net/src/net.c
@@ -186,9 +186,9 @@ net_resolve_cb(uv_getaddrinfo_t *rv, int err, net_ai * ai) {
   ret = uv_tcp_connect(net->conn, net->handle, (const struct sockaddr*) &dest, net_connect_cb);
   if (ret != NET_OK) {
     if (net->error_cb) {
-      net->error_cb(net, ret, (char *) uv_strerror(err));
+      net->error_cb(net, ret, (char *) uv_strerror(ret));
     } else {
-      printf("error(%s:%d) %s", net->hostname, net->port, (char *) uv_strerror(err));
+      printf("error(%s:%d) %s", net->hostname, net->port, (char *) uv_strerror(ret));
       net_free(net);
     }
     return;
@@ -206,12 +206,7 @@ net_connect_cb(uv_connect_t *conn, int err) {
   int read;
 
   if (err < 0) {
-    if (net->error_cb) {
-      net->error_cb(net, err, (char *) uv_strerror(err));
-    } else {
-      printf("error(%s:%d) %s", net->hostname, net->port, (char *) uv_strerror(err));
-      net_free(net);
-    }
+    net_free(net);
     return;
   }
 
diff --git a/src/Cpu.cpp b/src/Cpu.cpp
index 10ba818a..ddd4642d 100644
--- a/src/Cpu.cpp
+++ b/src/Cpu.cpp
@@ -55,7 +55,7 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
                                  Options::Algo algo, size_t maxCpuUsage, bool safeMode)
 {
     // limits hashfactor to maximum possible value defined by compiler flag
-    hashFactor = std::min(hashFactor, static_cast<size_t>(MAX_NUM_HASH_BLOCKS));
+    hashFactor = std::min(hashFactor, algo == Options::ALGO_CRYPTONIGHT_HEAVY ? 3 : static_cast<size_t>(MAX_NUM_HASH_BLOCKS));
 
     if (!safeMode && threadsCount > 0 && hashFactor > 0) {
@@ -69,6 +69,9 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
         case Options::ALGO_CRYPTONIGHT_LITE:
             algoBlockSize = 1024;
             break;
+        case Options::ALGO_CRYPTONIGHT_HEAVY:
+            algoBlockSize = 4096;
+            break;
         case Options::ALGO_CRYPTONIGHT:
         default:
             algoBlockSize = 2048;
@@ -77,7 +80,7 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
 
     size_t maximumReasonableFactor = std::max(cache / algoBlockSize, static_cast<size_t>(1ul));
     size_t maximumReasonableThreadCount = std::min(maximumReasonableFactor, m_totalThreads);
-    size_t maximumReasonableHashFactor = std::min(maximumReasonableFactor, static_cast<size_t>(MAX_NUM_HASH_BLOCKS));
+    size_t maximumReasonableHashFactor = std::min(maximumReasonableFactor, algo == Options::ALGO_CRYPTONIGHT_HEAVY ? 3 : static_cast<size_t>(MAX_NUM_HASH_BLOCKS));
 
     if (safeMode) {
         if (threadsCount > maximumReasonableThreadCount) {
diff --git a/src/Mem.cpp b/src/Mem.cpp
index a7e0fc8a..522deab4 100644
--- a/src/Mem.cpp
+++ b/src/Mem.cpp
@@ -39,7 +39,21 @@ Mem::ThreadBitSet Mem::m_multiHashThreadMask = Mem::ThreadBitSet(-1L);
 
 cryptonight_ctx *Mem::create(int threadId)
 {
-    size_t scratchPadSize = m_algo == Options::ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
+    size_t scratchPadSize;
+
+    switch (m_algo)
+    {
+        case Options::ALGO_CRYPTONIGHT_LITE:
+            scratchPadSize = MEMORY_LITE;
+            break;
+        case Options::ALGO_CRYPTONIGHT_HEAVY:
+            scratchPadSize = MEMORY_HEAVY;
+            break;
+        case Options::ALGO_CRYPTONIGHT:
+        default:
+            scratchPadSize = MEMORY;
+            break;
+    }
 
     size_t offset = 0;
     for (int i=0; i < threadId; i++) {
diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp
index 0fd3031b..b52c449b 100644
--- a/src/Mem_unix.cpp
+++ b/src/Mem_unix.cpp
@@ -46,7 +46,21 @@ bool Mem::allocate(const Options* options)
     m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask());
     m_memorySize = 0;
 
-    size_t scratchPadSize = m_algo == Options::ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE;
+    size_t scratchPadSize;
+    switch (m_algo)
+    {
+        case Options::ALGO_CRYPTONIGHT_LITE:
+            scratchPadSize = MEMORY_LITE;
+            break;
+        case Options::ALGO_CRYPTONIGHT_HEAVY:
+            scratchPadSize = MEMORY_HEAVY;
+            break;
+        case Options::ALGO_CRYPTONIGHT:
+        default:
+            scratchPadSize = MEMORY;
+            break;
+    }
+
     for (size_t i=0; i < m_threads; i++) {
         m_memorySize += sizeof(cryptonight_ctx);
         m_memorySize += scratchPadSize * getThreadHashFactor(i);
diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp
index 0f46b6b3..a52cc5b5 100644
--- a/src/Mem_win.cpp
+++ b/src/Mem_win.cpp
@@ -152,7 +152,21 @@ bool Mem::allocate(const Options* options)
     m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask());
     m_memorySize = 0;
 
-    size_t scratchPadSize = m_algo == Options::ALGO_CRYPTONIGHT ? 
MEMORY : MEMORY_LITE; + size_t scratchPadSize; + switch (m_algo) + { + case Options::ALGO_CRYPTONIGHT_LITE: + scratchPadSize = MEMORY_LITE; + break; + case Options::ALGO_CRYPTONIGHT_HEAVY: + scratchPadSize = MEMORY_HEAVY; + break; + case Options::ALGO_CRYPTONIGHT: + default: + scratchPadSize = MEMORY; + break; + } + for (size_t i=0; i < m_threads; i++) { m_memorySize += sizeof(cryptonight_ctx); m_memorySize += scratchPadSize * getThreadHashFactor(i); diff --git a/src/Options.cpp b/src/Options.cpp index dd8c8f34..a5ada912 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -64,7 +64,7 @@ Usage: " APP_ID " [OPTIONS]\n\ Options:\n" # ifndef XMRIG_CC_SERVER "\ - -a, --algo=ALGO cryptonight (default) or cryptonight-lite\n\ + -a, --algo=ALGO cryptonight (default) / cryptonight-lite or cryptonight-heavy\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -76,6 +76,7 @@ Options:\n" -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\ --force-pow-version=N force to use specific PoW variation (default: 0 POW_AUTODETECT, 1 POW_V1, 2 POW_V2)\n\ + --multihash-factor=N number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)\n\ --multihash-thread-mask for av=2/4 only, limits multihash to given threads (mask), (default: all threads)\n\ --cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\ --cpu-priority set process priority (0 idle, 2 normal to 5 highest)\n\ @@ -99,7 +100,7 @@ Options:\n" --cc-update-interval-s=N status update interval in seconds (default: 10 min: 1)\n" # endif # endif - + # ifdef XMRIG_CC_SERVER "\ --cc-user=USERNAME CC Server admin user\n\ @@ -111,13 +112,13 @@ Options:\n" --cc-key-file=FILE when tls is turned on, use this to point to the right key file (default: server.key) \n\ --cc-client-config-folder=FOLDER Folder contains the client config files\n\ --cc-custom-dashboard=FILE loads a custom dashboard and serve it to '/'\n" -# endif +# endif "\ --no-color disable colored output\n" -# ifdef HAVE_SYSLOG_H +# ifdef HAVE_SYSLOG_H "\ -S, --syslog use system log for output messages\n" -# endif +# endif "\ -B, --background run the miner in the background\n\ -c, --config=FILE load a JSON-format configuration file\n\ @@ -257,9 +258,8 @@ static struct option const cc_server_options[] = { static const char *algo_names[] = { "cryptonight", -# ifndef XMRIG_NO_AEON - "cryptonight-lite" -# endif + "cryptonight-lite", + "cryptonight-heavy" }; @@ -616,7 +616,7 @@ bool Options::parseArg(int key, uint64_t arg) break; case 't': /* --threads */ - if (arg < 1 || arg > 1024) { + if (arg < 0 || arg > 1024) { showUsage(1); return false; } @@ -919,12 +919,15 @@ bool Options::setAlgo(const char *algo) break; } -# ifndef XMRIG_NO_AEON if (i == ARRAY_SIZE(algo_names) - 1 && !strcmp(algo, "cryptonight-light")) { m_algo = ALGO_CRYPTONIGHT_LITE; break; } -# endif + + if (i == ARRAY_SIZE(algo_names) - 1 && !strcmp(algo, "cryptonight-heavy")) { + m_algo = ALGO_CRYPTONIGHT_HEAVY; + break; + } if (i == ARRAY_SIZE(algo_names) - 1) { showUsage(1); @@ -977,6 +980,11 @@ void Options::optimizeAlgorithmConfiguration() m_aesni = aesniFromCpu; } + if (m_algo == Options::ALGO_CRYPTONIGHT_HEAVY && m_hashFactor > 3) { + fprintf(stderr, "Maximum supported hashfactor for cryptonight-heavy is: 3\n"); + m_hashFactor = 3; + } + 
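
[Editor's note on the hashfactor clamp above: every concurrent hash block needs its own scratchpad, and the cn-heavy pad is 4 MiB (MEMORY_HEAVY, defined later in this patch), so the per-thread footprint grows quickly with the hash factor. A minimal standalone sketch of that budget arithmetic; the constant mirrors MEMORY_HEAVY and the program is illustrative only:]

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t scratchPadHeavy = 4 * 1024 * 1024; // MEMORY_HEAVY, 4 MiB
    // The patch clamps the cn-heavy hashfactor to 3, i.e. at most 12 MiB per thread.
    for (std::size_t hashFactor = 1; hashFactor <= 3; ++hashFactor) {
        std::printf("hashfactor %zu -> %zu MiB of scratchpad per thread\n",
                    hashFactor, hashFactor * scratchPadHeavy / (1024 * 1024));
    }
    return 0;
}
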
Cpu::optimizeParameters(m_threads, m_hashFactor, m_algo, m_maxCpuUsage, m_safe); } diff --git a/src/Options.h b/src/Options.h index 2e0a8b12..10a9a327 100644 --- a/src/Options.h +++ b/src/Options.h @@ -44,6 +44,7 @@ public: enum Algo { ALGO_CRYPTONIGHT, /* CryptoNight (Monero) */ ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (AEON) */ + ALGO_CRYPTONIGHT_HEAVY /* CryptoNight-Heavy (SUMO) */ }; enum AlgoVariant { diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index 00dab9b3..34b9793d 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -34,7 +34,7 @@ #include "crypto/CryptoNight_test.h" template -static void cryptonight_aesni(const void *input, size_t size, void *output, cryptonight_ctx *ctx) { +static void cryptonight_aesni(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { # if !defined(XMRIG_ARMv7) if ((reinterpret_cast(input)[0] > 6 && Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT) || Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { @@ -46,7 +46,7 @@ static void cryptonight_aesni(const void *input, size_t size, void *output, cryp } template -static void cryptonight_softaes(const void *input, size_t size, void *output, cryptonight_ctx *ctx) { +static void cryptonight_softaes(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { if ((reinterpret_cast(input)[0] > 6 && Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT) || Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { CryptoNightMultiHash<0x80000, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); @@ -56,7 +56,7 @@ static void cryptonight_softaes(const void *input, size_t size, void *output, cr } template -static void cryptonight_lite_aesni(const void *input, size_t size, void *output, cryptonight_ctx *ctx) { +static void cryptonight_lite_aesni(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { # if !defined(XMRIG_ARMv7) if ((reinterpret_cast(input)[0] > 1 && Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT) || Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { @@ -68,7 +68,7 @@ static void cryptonight_lite_aesni(const void *input, size_t size, void *output, } template -static void cryptonight_lite_softaes(const void *input, size_t size, void *output, cryptonight_ctx *ctx) { +static void cryptonight_lite_softaes(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { if ((reinterpret_cast(input)[0] > 1 && Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT) || Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { CryptoNightMultiHash<0x40000, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); @@ -77,7 +77,20 @@ static void cryptonight_lite_softaes(const void *input, size_t size, void *outpu } } -void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(const void *input, size_t size, void *output, cryptonight_ctx *ctx); + +template +static void cryptonight_heavy_aesni(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +# if !defined(XMRIG_ARMv7) + CryptoNightMultiHash<0x40000, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx); +# endif +} + +template +static void cryptonight_heavy_softaes(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { + CryptoNightMultiHash<0x40000, MEMORY_HEAVY, 0x3FFFF0, true, 
NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx); +} + +void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx); template void setCryptoNightHashMethods(Options::Algo algo, bool aesni) @@ -98,6 +111,14 @@ void setCryptoNightHashMethods(Options::Algo algo, bool aesni) cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_lite_softaes; } break; + + case Options::ALGO_CRYPTONIGHT_HEAVY: + if (aesni) { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_aesni; + } else { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_softaes; + } + break; } // next iteration setCryptoNightHashMethods(algo, aesni); @@ -139,56 +160,92 @@ bool CryptoNight::selfTest(int algo) return false; } - char output[160]; + uint8_t output[160]; auto ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16); ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16); bool resultV1Pow = true; - if (Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT || Options::i()->forcePowVersion() == Options::PowVersion::POW_V1) { - cryptonight_hash_ctx[0](test_input, 76, output, ctx); - resultV1Pow = resultV1Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 32) == 0; - -#if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](test_input, 76, output, ctx); - resultV1Pow = resultV1Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 64) == 0; -#endif - -#if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](test_input, 76, output, ctx); - resultV1Pow = resultV1Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 96) == 0; -#endif - -#if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](test_input, 76, output, ctx); - resultV1Pow = resultV1Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 128) == 0; -#endif - -#if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](test_input, 76, output, ctx); - resultV1Pow = resultV1Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, 160) == 0; -#endif - } - - // monero/aeon v2 pow (monero/aeon blockchain version 7) bool resultV2Pow = true; - if (Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT || Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { - cryptonight_hash_ctx[0](test_input_monero_v2_pow_0, sizeof(test_input_monero_v2_pow_0), output, ctx); - resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_monero_v2_pow_light[0] : test_output_monero_v2_pow[0], 32) == 0; + bool resultHeavy = true; -#if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](test_input_monero_v2_pow_1, sizeof(test_input_monero_v2_pow_1), output, ctx); - resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_monero_v2_pow_light[1] : test_output_monero_v2_pow[1], 32) == 0; -#endif + if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) + { + cryptonight_hash_ctx[0](test_input, 76, output, ctx); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0; -#if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](test_input_monero_v2_pow_2, sizeof(test_input_monero_v2_pow_2), output, ctx); - resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? 
test_output_monero_v2_pow_light[2] : test_output_monero_v2_pow[2], 32) == 0; -#endif + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](test_input, 76, output, ctx); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](test_input, 76, output, ctx); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0; + #endif } + else { + if (Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT || + Options::i()->forcePowVersion() == Options::PowVersion::POW_V1) { + cryptonight_hash_ctx[0](test_input, 76, output, ctx); + resultV1Pow = resultV1Pow && + memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, + 32) == 0; + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](test_input, 76, output, ctx); + resultV1Pow = resultV1Pow && + memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, + 64) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](test_input, 76, output, ctx); + resultV1Pow = resultV1Pow && + memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, + 96) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 3 + cryptonight_hash_ctx[3](test_input, 76, output, ctx); + resultV1Pow = resultV1Pow && + memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, + 128) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 4 + cryptonight_hash_ctx[4](test_input, 76, output, ctx); + resultV1Pow = resultV1Pow && + memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE ? test_output_light : test_output, + 160) == 0; + #endif + } + + // monero/aeon v2 pow (monero/aeon blockchain version 7) + if (Options::i()->forcePowVersion() == Options::PowVersion::POW_AUTODETECT || + Options::i()->forcePowVersion() == Options::PowVersion::POW_V2) { + cryptonight_hash_ctx[0](test_input_monero_v2_pow_0, sizeof(test_input_monero_v2_pow_0), output, ctx); + resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE + ? test_output_monero_v2_pow_light[0] + : test_output_monero_v2_pow[0], 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](test_input_monero_v2_pow_1, sizeof(test_input_monero_v2_pow_1), output, ctx); + resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE + ? test_output_monero_v2_pow_light[1] + : test_output_monero_v2_pow[1], 32) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](test_input_monero_v2_pow_2, sizeof(test_input_monero_v2_pow_2), output, ctx); + resultV2Pow = resultV2Pow && memcmp(output, algo == Options::ALGO_CRYPTONIGHT_LITE + ? 
test_output_monero_v2_pow_light[2]
+                                                     : test_output_monero_v2_pow[2], 32) == 0;
+        #endif
+        }
+    }
 
     _mm_free(ctx->memory);
     _mm_free(ctx);
 
-    return resultV1Pow && resultV2Pow;
+    return resultV1Pow && resultV2Pow && resultHeavy;
 }
\ No newline at end of file
diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h
index dc6b397b..393b2892 100644
--- a/src/crypto/CryptoNight.h
+++ b/src/crypto/CryptoNight.h
@@ -32,6 +32,7 @@
 
 #define MEMORY       2097152 /* 2 MiB */
 #define MEMORY_LITE  1048576 /* 1 MiB */
+#define MEMORY_HEAVY 4194304 /* 4 MiB */
 
 struct cryptonight_ctx {
     alignas(16) uint8_t state[MAX_NUM_HASH_BLOCKS][208]; // 208 instead of 200 to maintain alignment to 16-byte boundaries
diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index dd3699f9..fbe329dd 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -48,33 +48,27 @@ extern "C"
 #include "crypto/c_skein.h"
 }
 
-
-static inline void do_blake_hash(const void* input, size_t len, char* output)
-{
-    blake256_hash(reinterpret_cast<uint8_t*>(output), static_cast<const uint8_t*>(input), len);
+static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    blake256_hash(output, input, len);
 }
 
-static inline void do_groestl_hash(const void* input, size_t len, char* output)
-{
-    groestl(static_cast<const uint8_t*>(input), len * 8, reinterpret_cast<uint8_t*>(output));
+static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    groestl(input, len * 8, output);
 }
 
-static inline void do_jh_hash(const void* input, size_t len, char* output)
-{
-    jh_hash(32 * 8, static_cast<const uint8_t*>(input), 8 * len, reinterpret_cast<uint8_t*>(output));
+static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    jh_hash(32 * 8, input, 8 * len, output);
 }
 
-static inline void do_skein_hash(const void* input, size_t len, char* output)
-{
-    xmr_skein(static_cast<const uint8_t*>(input), reinterpret_cast<uint8_t*>(output));
+static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) {
+    xmr_skein(input, output);
 }
 
-void
-(* const extra_hashes[4])(const void*, size_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
 
 
 static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64_t a, const uint64_t b)
@@ -101,9 +95,7 @@ static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
     return (uint64_t) r;
 }
 #else
-
-static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
-{
+static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
     // multiplier   = ab = a * 2^32 + b
     // multiplicand = cd = c * 2^32 + d
     // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
@@ -127,7 +119,6 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
     return product_lo;
 }
-
 #endif
 
 
@@ -146,38 +137,22 @@ static inline __m128i sl_xor(__m128i tmp1)
 }
 
 
-template <uint8_t rcon>
-static inline void aes_genkey_sub(__m128i* xout0, __m128i* xout2)
-{
-//  __m128i xout1 = _mm_aeskeygenassist_si128(*xout2, rcon);
-//  xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
-//  *xout0 = sl_xor(*xout0);
-//  *xout0 = _mm_xor_si128(*xout0, xout1);
-//  xout1 = _mm_aeskeygenassist_si128(*xout0, 0x00);
-//  xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
-//  *xout2 = sl_xor(*xout2);
-//  *xout2 = _mm_xor_si128(*xout2, xout1);
-}
-
-
 template <uint8_t rcon>
static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2) { __m128i xout1 = soft_aeskeygenassist(*xout2); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem *xout0 = sl_xor(*xout0); *xout0 = _mm_xor_si128(*xout0, xout1); - xout1 = soft_aeskeygenassist<0x00>(*xout0); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + xout1 = soft_aeskeygenassist<0x00>(*xout0); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem *xout2 = sl_xor(*xout2); *xout2 = _mm_xor_si128(*xout2, xout1); } template -static inline void -aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, - __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) +static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) { __m128i xout0 = _mm_load_si128(memory); __m128i xout2 = _mm_load_si128(memory + 1); @@ -203,9 +178,7 @@ aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i template -static inline void -aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, - __m128i* x7) +static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) { if (SOFT_AES) { *x0 = soft_aesenc((uint32_t*)x0, key); @@ -219,19 +192,32 @@ aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m12 } # ifndef XMRIG_ARMv7 else { - *x0 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x0), key)); - *x1 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x1), key)); - *x2 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x2), key)); - *x3 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x3), key)); - *x4 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x4), key)); - *x5 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x5), key)); - *x6 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x6), key)); - *x7 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t*) x7), key)); + *x0 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x0), key)); + *x1 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x1), key)); + *x2 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x2), key)); + *x3 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x3), key)); + *x4 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x4), key)); + *x5 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x5), key)); + *x6 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x6), key)); + *x7 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x7), key)); } # endif } +inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) +{ + __m128i tmp0 = x0; + x0 = _mm_xor_si128(x0, x1); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_xor_si128(x2, x3); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_xor_si128(x4, x5); + x5 = _mm_xor_si128(x5, x6); + x6 = _mm_xor_si128(x6, x7); + x7 = _mm_xor_si128(x7, tmp0); +} + template static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output) { @@ -288,6 +274,93 @@ static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } +template +static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* output) +{ + __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + 
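
[Editor's note: unlike the classic explode above it, cn_explode_scratchpad_heavy first runs 16 full-state AES passes, each followed by mix_and_propagate(), before it writes anything to the scratchpad, so every output block depends on all 128 state bytes. A scalar model of mix_and_propagate() for reference; lanes[2k] and lanes[2k+1] stand in for the two 64-bit halves of the SIMD register xk, and all names are illustrative:]

#include <array>
#include <cstddef>
#include <cstdint>

// Each 16-byte lane absorbs its right-hand neighbour (x0 ^= x1, ..., x6 ^= x7);
// the last lane wraps around to the pre-update value of x0, as in the NEON code.
static void mix_and_propagate_model(std::array<std::uint64_t, 16>& lanes) {
    const std::uint64_t x0_lo = lanes[0];
    const std::uint64_t x0_hi = lanes[1];
    for (std::size_t k = 0; k < 7; ++k) {
        lanes[2 * k]     ^= lanes[2 * k + 2];
        lanes[2 * k + 1] ^= lanes[2 * k + 3];
    }
    lanes[14] ^= x0_lo; // x7 ^= original x0
    lanes[15] ^= x0_hi;
}
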
xin0 = _mm_load_si128(input + 4); + xin1 = _mm_load_si128(input + 5); + xin2 = _mm_load_si128(input + 6); + xin3 = _mm_load_si128(input + 7); + xin4 = _mm_load_si128(input + 8); + xin5 = _mm_load_si128(input + 9); + xin6 = _mm_load_si128(input + 10); + xin7 = _mm_load_si128(input + 11); + + for (size_t i = 0; i < 16; i++) { + if (!SOFT_AES) { + aes_round(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + + if (!SOFT_AES) { + xin0 ^= k9; + xin1 ^= k9; + xin2 ^= k9; + xin3 ^= k9; + xin4 ^= k9; + xin5 ^= k9; + xin6 ^= k9; + xin7 ^= k9; + } + else { + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + + mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7); + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { + if (!SOFT_AES) { + aes_round(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + + if (!SOFT_AES) { + xin0 ^= k9; + xin1 ^= k9; + xin2 ^= k9; + xin3 ^= k9; + xin4 ^= k9; + xin5 ^= k9; + xin6 ^= k9; + xin7 ^= k9; + } else { + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + } + + _mm_store_si128(output + i + 0, xin0); + _mm_store_si128(output + i + 1, xin1); + _mm_store_si128(output + i + 2, xin2); + _mm_store_si128(output + i + 3, xin3); + _mm_store_si128(output + i + 4, xin4); + _mm_store_si128(output + i + 5, xin5); + _mm_store_si128(output + i + 6, xin6); + _mm_store_si128(output + i + 7, xin7); + } +} template static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output) @@ -354,14 +427,156 @@ static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +template +static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* output) +{ + __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xout0 = _mm_load_si128(output + 4); + xout1 = _mm_load_si128(output + 5); + xout2 = _mm_load_si128(output + 6); + xout3 = _mm_load_si128(output + 7); + xout4 = _mm_load_si128(output + 8); + xout5 = _mm_load_si128(output + 9); + xout6 = 
_mm_load_si128(output + 10); + xout7 = _mm_load_si128(output + 11); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) + { + xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); + xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + + if (!SOFT_AES) { + aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + if (!SOFT_AES) { + xout0 ^= k9; + xout1 ^= k9; + xout2 ^= k9; + xout3 ^= k9; + xout4 ^= k9; + xout5 ^= k9; + xout6 ^= k9; + xout7 ^= k9; + } + else { + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { + xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); + xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + + if (!SOFT_AES) { + aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + if (!SOFT_AES) { + xout0 ^= k9; + xout1 ^= k9; + xout2 ^= k9; + xout3 ^= k9; + xout4 ^= k9; + xout5 ^= k9; + xout6 ^= k9; + xout7 ^= k9; + } + else { + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + 
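
[Editor's note: the heavy implode absorbs the full 4 MiB scratchpad twice (the loop that just ended is the second pass, again with mix_and_propagate()) and then runs the 16 state-only mixing rounds that follow. A rough cost model, constants taken from this patch, sketch only:]

#include <cstddef>
#include <cstdio>

int main() {
    const std::size_t mem = 4 * 1024 * 1024;       // MEMORY_HEAVY
    const std::size_t stepsPerPass = mem / 16 / 8; // 8 x 16-byte lanes per step
    std::printf("implode: 2 passes x %zu absorb steps, then 16 mixing rounds\n",
                stepsPerPass);
    return 0;
}
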
for (size_t i = 0; i < 16; i++) { + if (!SOFT_AES) { + aes_round(_mm_setzero_si128(), &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + if (!SOFT_AES) { + xout0 ^= k9; + xout1 ^= k9; + xout2 ^= k9; + xout3 ^= k9; + xout4 ^= k9; + xout5 ^= k9; + xout6 ^= k9; + xout7 ^= k9; + } + else { + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + } + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + _mm_store_si128(output + 4, xout0); + _mm_store_si128(output + 5, xout1); + _mm_store_si128(output + 6, xout2); + _mm_store_si128(output + 7, xout3); + _mm_store_si128(output + 8, xout4); + _mm_store_si128(output + 9, xout5); + _mm_store_si128(output + 10, xout6); + _mm_store_si128(output + 11, xout7); +} + // n-Loop version. Seems to be little bit slower then the hardcoded one. template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { const uint8_t* l[NUM_HASH_BLOCKS]; @@ -370,21 +585,87 @@ public: uint64_t ah[NUM_HASH_BLOCKS]; __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - uint64_t version[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); - version[hashBlock] = static_cast(input)[hashBlock * size]; - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = ctx->memory + hashBlock * MEM; + h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + } else { + cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); +# ifndef XMRIG_ARMv7 + cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); +# endif } 
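                // [Editor's note on the zero-key AESE trick above, an identity
                // that follows from the ARMv8 Crypto extension spec: on x86,
                // _mm_aesenc_si128(x, k) computes
                //     MixColumns(SubBytes(ShiftRows(x))) ^ k,
                // while ARMv8's AESE xors the round key in *before* SubBytes and
                // ShiftRows. Feeding AESE a zero key and xoring the key in
                // afterwards therefore reproduces the x86 round exactly:
                //     vaesmcq_u8(vaeseq_u8(x, vdupq_n_u8(0))) ^ k
                //         == _mm_aesenc_si128(x, k) ]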
+ + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx)); + + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; } } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + output + hashBlock * 32); + } + } + + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t tweak1_2[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + tweak1_2[hashBlock] = (*reinterpret_cast(input + 35 + hashBlock * size) ^ + *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { l[hashBlock] = ctx->memory + hashBlock * MEM; h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); @@ -403,23 +684,21 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); +# ifndef XMRIG_ARMv7 + cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); +# endif } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } - } + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx[hashBlock] = EXTRACT64(cx); bx[hashBlock] = cx; @@ -432,20 +711,12 @@ public: al[hashBlock] += hi; ah[hashBlock] += lo; - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - ah[hashBlock] ^= tweak1_2[hashBlock]; - } - } + ah[hashBlock] ^= tweak1_2[hashBlock]; ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - /*if (MONERO)*/ { - 
if (version[hashBlock] > 6) { - ah[hashBlock] ^= tweak1_2[hashBlock]; - } - } + ah[hashBlock] ^= tweak1_2[hashBlock]; ah[hashBlock] ^= ch; al[hashBlock] ^= cl; @@ -457,16 +728,88 @@ public: cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - static_cast(output) + hashBlock * 32); + output + hashBlock * 32); } } - inline static void hashPowV2(const void* __restrict__ input, - size_t size, - void* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) { - return hash(input, size, output, ctx); + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, + ctx->state[hashBlock], 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = ctx->memory + hashBlock * MEM; + h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + + cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + } else { + cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); +# ifndef XMRIG_ARMv7 + cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); +# endif + } + + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx)); + + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; + int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; + idx[hashBlock] = d ^ q; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + output + hashBlock * 32); + } } }; @@ -474,9 +817,9 @@ template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ 
input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { const uint8_t* l; @@ -509,7 +852,7 @@ public: # ifndef XMRIG_ARMv7 cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); # endif - } + } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); idx = EXTRACT64(cx); @@ -533,12 +876,12 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - } + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { const uint8_t* l; @@ -550,10 +893,10 @@ public: keccak(static_cast(input), (int) size, ctx->state[0], 200); - uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + uint64_t tweak1_2 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(ctx->state[0]) + 24)); + l = ctx->memory; + h = reinterpret_cast(ctx->state[0]); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -575,13 +918,13 @@ public: # endif } - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); - const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - idx = EXTRACT64(cx); - bx = cx; + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + idx = EXTRACT64(cx); + bx = cx; uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l[idx & MASK])[0]; @@ -591,10 +934,10 @@ public: al += hi; ah += lo; - ah ^= tweak1_2; - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; - ah ^= tweak1_2; + ah ^= tweak1_2; + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + ah ^= tweak1_2; ah ^= ch; al ^= cl; @@ -603,21 +946,90 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; + + keccak(static_cast(input), (int) size, ctx->state[0], 200); + + l = ctx->memory; + h = reinterpret_cast(ctx->state[0]); + + cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); + } + else { + cx = _mm_load_si128((__m128i *) &l[idx & MASK]); +# ifndef XMRIG_ARMv7 + cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ 
_mm_set_epi64x(ah, al); +# endif + } + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + idx = EXTRACT64(cx); + bx = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); + + al += hi; + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + int64_t n = ((int64_t*)&l[idx & MASK])[0]; + int32_t d = ((int32_t*)&l[idx & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[idx & MASK])[0] = n ^ q; + idx = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + } }; template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; @@ -700,21 +1112,21 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ *(reinterpret_cast(ctx->state[1]) + 24)); const uint8_t* l0 = ctx->memory; @@ -810,23 +1222,130 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = 
h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + +# ifndef XMRIG_ARMv7 + cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); + cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); +# endif + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + } }; template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; @@ -942,25 +1461,25 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); 
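
[Editor's note: the epilogue pattern repeated in every variant selects the finalizer from the two low bits of the first byte of each 200-byte Keccak state: 0 BLAKE-256, 1 Groestl-256, 2 JH-256, 3 Skein-256, matching the extra_hashes table declared at the top of this file. A self-contained sketch of that dispatch; the enum and function names are illustrative:]

#include <cstdint>

enum class Finalizer : std::uint8_t { Blake256 = 0, Groestl256 = 1, JH256 = 2, Skein256 = 3 };

// Mirrors extra_hashes[ctx->state[i][0] & 3](...): the state itself picks
// which 256-bit finalizer condenses it into the 32-byte output slot.
static Finalizer pick_finalizer(const std::uint8_t* keccakState) {
    return static_cast<Finalizer>(keccakState[0] & 3);
}
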
+ extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ *(reinterpret_cast(ctx->state[2]) + 24)); const uint8_t* l0 = ctx->memory; @@ -1094,25 +1613,174 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + const uint8_t* l2 = ctx->memory + 2 * MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + uint64_t* h2 = reinterpret_cast(ctx->state[2]); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } + else { + cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i 
*) &l2[idx2 & MASK]); +# ifndef XMRIG_ARMv7 + cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); + cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); + cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); +# endif + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + n = ((int64_t*)&l2[idx2 & MASK])[0]; + d = ((int32_t*)&l2[idx2 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; + idx2 = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + } }; template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; @@ -1259,29 +1927,29 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - 
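// --- Editor's illustrative sketch (not part of the patch) ----------------
// The extra block ending each lane above is what makes this variant "heavy":
// after the classic multiply/xor step, a signed 64/32-bit division is mixed
// back into the scratchpad and into the next index. Scalar form of one lane
// (names hypothetical; mirrors the loop body, including the `| 0x5`, which
// keeps the divisor non-zero):
#include <cstdint>

static inline uint64_t heavy_div_step(uint8_t* scratch, uint64_t idx, uint64_t mask)
{
    int64_t n = *reinterpret_cast<int64_t*>(&scratch[idx & mask]);        // bytes 0..7
    int32_t d = *reinterpret_cast<int32_t*>(&scratch[(idx & mask) + 8]);  // bytes 8..11, the [2] int32
    int64_t q = n / (d | 0x5);
    *reinterpret_cast<int64_t*>(&scratch[idx & mask]) = n ^ q;            // fold the quotient back in
    return static_cast<uint64_t>(static_cast<int64_t>(d)) ^ static_cast<uint64_t>(q); // next index
}
// --------------------------------------------------------------------------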
extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast(output) + 96); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ *(reinterpret_cast(ctx->state[2]) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ *(reinterpret_cast(ctx->state[3]) + 24)); const uint8_t* l0 = ctx->memory; @@ -1451,27 +2119,216 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast(output) + 96); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + const uint8_t* l2 = ctx->memory + 2 * MEM; + const uint8_t* l3 = ctx->memory + 3 * MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + uint64_t* h2 = reinterpret_cast(ctx->state[2]); + uint64_t* h3 = reinterpret_cast(ctx->state[3]); + + cn_explode_scratchpad_heavy((__m128i*) h0, 
(__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad_heavy((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + } else { +# ifndef XMRIG_ARMv7 + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); + cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); + cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); + cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); +# endif + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 
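// --- Editor's note with sketch (assumes an ARMv8 crypto target) -----------
// ARMv8's AESE instruction applies AddRoundKey *before* SubBytes/ShiftRows,
// whereas x86 AESENC applies the round key last. Encrypting against an
// all-zero key and XORing the real key afterwards, as the vaeseq_u8 /
// vaesmcq_u8 lines above do, reproduces AESENC's semantics:
#include <arm_neon.h>

static inline uint8x16_t aesenc_equivalent(uint8x16_t block, uint8x16_t round_key)
{
    // zero key => AddRoundKey is a no-op; AESMC supplies MixColumns;
    // the trailing XOR performs AddRoundKey last, matching x86 AESENC.
    return veorq_u8(vaesmcq_u8(vaeseq_u8(block, vdupq_n_u8(0))), round_key);
}
// ---------------------------------------------------------------------------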
^= ch; + al2 ^= cl; + idx2 = al2; + + n = ((int64_t*)&l2[idx2 & MASK])[0]; + d = ((int32_t*)&l2[idx2 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; + idx2 = d ^ q; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + n = ((int64_t*)&l3[idx3 & MASK])[0]; + d = ((int32_t*)&l3[idx3 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l3[idx3 & MASK])[0] = n ^ q; + idx3 = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad_heavy((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + } }; template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); - keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input + 4 * size, (int) size, ctx->state[4], 200); const uint8_t* l0 = ctx->memory; const uint8_t* l1 = ctx->memory + MEM; @@ -1648,33 +2505,33 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast(output) + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, static_cast(output) + 128); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t *__restrict__ output, cryptonight_ctx* __restrict__ ctx) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - 
keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); - keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input + 4 * size, (int) size, ctx->state[4], 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ *(reinterpret_cast(ctx->state[2]) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ *(reinterpret_cast(ctx->state[3]) + 24)); - uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ + uint64_t tweak1_2_4 = (*reinterpret_cast(input + 35 + 4 * size) ^ *(reinterpret_cast(ctx->state[4]) + 24)); @@ -1880,12 +2737,240 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast(output) + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, static_cast(output) + 128); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t *__restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak(input, (int) size, ctx->state[0], 200); + keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input + 4 * size, (int) size, ctx->state[4], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + const uint8_t* l2 = ctx->memory + 2 * MEM; + const uint8_t* l3 = ctx->memory + 3 * MEM; + const uint8_t* l4 = ctx->memory + 4 * MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + uint64_t* h2 = reinterpret_cast(ctx->state[2]); + uint64_t* h3 = reinterpret_cast(ctx->state[3]); + uint64_t* h4 = reinterpret_cast(ctx->state[4]); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad_heavy((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad_heavy((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] 
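// --- Editor's illustrative sketch (not part of the patch) ----------------
// Lane addressing used by every variant above: ctx->memory is one contiguous
// allocation, each lane k owns the MEM-byte slice at ctx->memory + k * MEM,
// and `idx & MASK` selects a 16-byte-aligned line inside the slice
// (assumption: MASK = MEM - 16 with the low 4 bits clear).
#include <cstddef>
#include <cstdint>

static inline uint8_t* scratch_line(uint8_t* memory, std::size_t lane,
                                    std::size_t mem, uint64_t idx, uint64_t mask)
{
    return memory + lane * mem + (idx & mask);
}
// --------------------------------------------------------------------------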
^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + } else { +# ifndef XMRIG_ARMv7 + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); + cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); + cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); + cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); + cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4); +# endif + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + bx4 = cx4; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch 
= ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + n = ((int64_t*)&l2[idx2 & MASK])[0]; + d = ((int32_t*)&l2[idx2 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; + idx2 = d ^ q; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + n = ((int64_t*)&l3[idx3 & MASK])[0]; + d = ((int32_t*)&l3[idx3 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l3[idx3 & MASK])[0] = n ^ q; + idx3 = d ^ q; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + lo = __umul128(idx4, cl, &hi); + + al4 += hi; + ah4 += lo; + + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + + n = ((int64_t*)&l4[idx4 & MASK])[0]; + d = ((int32_t*)&l4[idx4 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l4[idx4 & MASK])[0] = n ^ q; + idx4 = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad_heavy((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad_heavy((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + } }; #endif /* __CRYPTONIGHT_ARM_H__ */ diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 0bf62ae9..3c103693 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -64,11 +64,13 @@ const static uint8_t test_output[] = { 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, + 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, - 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, + + 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00 @@ -80,10 +82,12 @@ const static uint8_t test_output_light[] = { 0x75, 0xF2, 0x1D, 
0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, @@ -131,10 +135,22 @@ const static uint8_t test_output_monero_v2_pow_light[3][32] = { 0x03, 0xf5, 0x39, 0x53, 0x15, 0xde, 0x91, 0x9a, 0xcf, 0x1b, 0x97, 0xf0, 0xa8, 0x4f, 0xbd, 0x2d} }; +const static uint8_t test_output_heavy[] = { + 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, + 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, + 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, + 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, + 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, + 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, + 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, + 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, - - + 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, + 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, + 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, + 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2 +}; #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 2e954a28..4e602e7e 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -48,32 +48,27 @@ extern "C" #include "crypto/c_skein.h" } -static inline void do_blake_hash(const void* input, size_t len, char* output) -{ - blake256_hash(reinterpret_cast(output), static_cast(input), len); +static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { + blake256_hash(output, input, len); } -static inline void do_groestl_hash(const void* input, size_t len, char* output) -{ - groestl(static_cast(input), len * 8, reinterpret_cast(output)); +static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { + groestl(input, len * 8, output); } -static inline void do_jh_hash(const void* input, size_t len, char* output) -{ - jh_hash(32 * 8, static_cast(input), 8 * len, reinterpret_cast(output)); +static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { + jh_hash(32 * 8, input, 8 * len, output); } -static inline void do_skein_hash(const void* input, size_t len, char* output) -{ - 
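// --- Editor's note with sketch --------------------------------------------
// The extra_hashes table below is CryptoNight's final-hash dispatch: the two
// low bits of the first state byte select BLAKE-256, Groestl-256, JH-256 or
// Skein (slots 0..3), each producing the 32-byte result for one lane.
#include <cstddef>
#include <cstdint>

using final_hash_fn = void (*)(const uint8_t*, std::size_t, uint8_t*);
extern final_hash_fn const extra_hashes[4];      // declared as in the patch

static inline void finalize_lane(const uint8_t state[200], uint8_t* out32)
{
    extra_hashes[state[0] & 3](state, 200, out32);   // 2-bit selector
}
// ---------------------------------------------------------------------------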
xmr_skein(static_cast(input), reinterpret_cast(output)); +static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { + xmr_skein(input, output); } -void (* const extra_hashes[4])(const void*, size_t, char*) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; - +void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; #if defined(__x86_64__) || defined(_M_AMD64) # define EXTRACT64(X) _mm_cvtsi128_si64(X) @@ -224,6 +219,18 @@ aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m12 } } +inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) +{ + __m128i tmp0 = x0; + x0 = _mm_xor_si128(x0, x1); + x1 = _mm_xor_si128(x1, x2); + x2 = _mm_xor_si128(x2, x3); + x3 = _mm_xor_si128(x3, x4); + x4 = _mm_xor_si128(x4, x5); + x5 = _mm_xor_si128(x5, x6); + x6 = _mm_xor_si128(x6, x7); + x7 = _mm_xor_si128(x7, tmp0); +} template static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output) @@ -265,6 +272,61 @@ static inline void cn_explode_scratchpad(const __m128i* input, __m128i* output) } } +template +static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* output) +{ + __m128i xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey(input, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xin0 = _mm_load_si128(input + 4); + xin1 = _mm_load_si128(input + 5); + xin2 = _mm_load_si128(input + 6); + xin3 = _mm_load_si128(input + 7); + xin4 = _mm_load_si128(input + 8); + xin5 = _mm_load_si128(input + 9); + xin6 = _mm_load_si128(input + 10); + xin7 = _mm_load_si128(input + 11); + + for (size_t i = 0; i < 16; i++) { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + + mix_and_propagate(xin0, xin1, xin2, xin3, xin4, xin5, xin6, xin7); + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { + aes_round(k0, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k1, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k2, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k3, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k4, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k5, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k6, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k7, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k8, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); + + _mm_store_si128(output + i + 0, xin0); + _mm_store_si128(output + i + 1, xin1); + _mm_store_si128(output + i + 2, xin2); + 
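// --- Editor's illustrative sketch (not part of the patch) ----------------
// mix_and_propagate (defined above) is a ring XOR over the eight 128-bit
// lanes: each lane absorbs its right neighbour, and the saved original x0
// wraps around to x7, so a difference in any lane reaches all lanes within
// a few rounds. Generic form:
#include <array>
#include <cstddef>

template <typename T, std::size_t N>
static void ring_xor(std::array<T, N>& x)
{
    T first = x[0];
    for (std::size_t i = 0; i + 1 < N; ++i)
        x[i] ^= x[i + 1];
    x[N - 1] ^= first;   // wrap-around, as in mix_and_propagate
}
// --------------------------------------------------------------------------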
_mm_store_si128(output + i + 3, xin3); + _mm_store_si128(output + i + 4, xin4); + _mm_store_si128(output + i + 5, xin5); + _mm_store_si128(output + i + 6, xin6); + _mm_store_si128(output + i + 7, xin7); + } +} + template static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output) @@ -315,14 +377,104 @@ static inline void cn_implode_scratchpad(const __m128i* input, __m128i* output) _mm_store_si128(output + 11, xout7); } +template +static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* output) +{ + __m128i xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7; + __m128i k0, k1, k2, k3, k4, k5, k6, k7, k8, k9; + + aes_genkey(output + 2, &k0, &k1, &k2, &k3, &k4, &k5, &k6, &k7, &k8, &k9); + + xout0 = _mm_load_si128(output + 4); + xout1 = _mm_load_si128(output + 5); + xout2 = _mm_load_si128(output + 6); + xout3 = _mm_load_si128(output + 7); + xout4 = _mm_load_si128(output + 8); + xout5 = _mm_load_si128(output + 9); + xout6 = _mm_load_si128(output + 10); + xout7 = _mm_load_si128(output + 11); + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { + xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); + xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { + xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); + xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); + xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); + xout3 = _mm_xor_si128(_mm_load_si128(input + i + 3), xout3); + xout4 = _mm_xor_si128(_mm_load_si128(input + i + 4), xout4); + xout5 = _mm_xor_si128(_mm_load_si128(input + i + 5), xout5); + xout6 = _mm_xor_si128(_mm_load_si128(input + i + 6), xout6); + xout7 = _mm_xor_si128(_mm_load_si128(input + i + 7), xout7); + + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, 
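// --- Editor's structural sketch (hypothetical helper names) ---------------
// Shape of the heavy implode that finishes below: two full absorbing passes
// over the scratchpad (XOR a stripe in, ten AES rounds, mix_and_propagate),
// then sixteen key-only diffusion rounds; the heavy explode above is the
// mirror image (sixteen warm-up rounds first, then one streaming pass out).
#include <cstddef>

template <typename Lanes, typename Round, typename Mix, typename Absorb>
static void implode_heavy_shape(std::size_t stripes, Lanes& lanes,
                                Round round, Mix mix, Absorb absorb)
{
    for (int pass = 0; pass < 2; ++pass)          // two passes over MEM
        for (std::size_t i = 0; i < stripes; ++i) {
            absorb(lanes, i);                     // XOR in the i-th 128-byte stripe
            round(lanes);                         // stands for the 10 aes_round calls
            mix(lanes);                           // mix_and_propagate
        }
    for (int i = 0; i < 16; ++i) {                // final diffusion
        round(lanes);
        mix(lanes);
    }
}
// ---------------------------------------------------------------------------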
&xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + for (size_t i = 0; i < 16; i++) { + aes_round(k0, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k1, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k2, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k3, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k4, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k5, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k6, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k7, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k8, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); + + mix_and_propagate(xout0, xout1, xout2, xout3, xout4, xout5, xout6, xout7); + } + + _mm_store_si128(output + 4, xout0); + _mm_store_si128(output + 5, xout1); + _mm_store_si128(output + 6, xout2); + _mm_store_si128(output + 7, xout3); + _mm_store_si128(output + 8, xout4); + _mm_store_si128(output + 9, xout5); + _mm_store_si128(output + 10, xout6); + _mm_store_si128(output + 11, xout7); +} + // n-Loop version. Seems to be a little bit slower than the hardcoded one. template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { const uint8_t* l[NUM_HASH_BLOCKS]; @@ -331,18 +483,9 @@ public: uint64_t ah[NUM_HASH_BLOCKS]; __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - uint64_t version[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); - version[hashBlock] = static_cast(input)[hashBlock * size]; - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); - } - } } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { @@ -372,15 +515,85 @@ public: _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & 
MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + output + hashBlock * 32); + } + } + + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t tweak1_2[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ + *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = ctx->memory + hashBlock * MEM; + h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = + _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + } else { + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx)); + + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + idx[hashBlock] = EXTRACT64(cx); bx[hashBlock] = cx; @@ -392,20 +605,12 @@ public: al[hashBlock] += hi; ah[hashBlock] += lo; - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - ah[hashBlock] ^= tweak1_2[hashBlock]; - } - } + ah[hashBlock] ^= tweak1_2[hashBlock]; ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - /*if (MONERO)*/ { - if (version[hashBlock] > 6) { - ah[hashBlock] ^= tweak1_2[hashBlock]; - } - } + ah[hashBlock] ^= tweak1_2[hashBlock]; ah[hashBlock] ^= ch; al[hashBlock] ^= cl; @@ -417,26 +622,96 @@ public: cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - static_cast(output) + hashBlock * 32); + output + hashBlock * 32); } } - inline static void hashPowV2(const void* __restrict__ input, - size_t size, - void* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + inline static void 
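// --- Editor's illustrative sketch (not part of the patch) ----------------
// The PoW-v2 single-byte tweak seen in hashPowV2 above, in isolation: a
// selector built from bits 0, 4 and 5 of scratchpad byte 11 indexes the
// constant 0x75310, and the chosen two bits flip bits 4..5 of that byte.
#include <cstdint>

static inline uint8_t tweak_byte_v7(uint8_t tmp)
{
    static const uint32_t table = 0x75310;
    const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1;
    return tmp ^ static_cast<uint8_t>((table >> index) & 0x30);
}
// --------------------------------------------------------------------------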
hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) { - hash(input, size, output, ctx); + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = ctx->memory + hashBlock * MEM; + h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + + cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + } else { + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + } + + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx)); + + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; + int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; + idx[hashBlock] = d ^ q; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + output + hashBlock * 32); + } } }; + template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { const uint8_t* l; @@ -490,84 +765,150 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); } - inline static void hashPowV2(const void* __restrict__ input, - size_t size, - void* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) - { - const uint8_t* l; - uint64_t* h; - uint64_t al; - uint64_t ah; - __m128i bx; - uint64_t idx; + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t 
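// --- Editor's hypothetical usage sketch -----------------------------------
// The generic hashHeavy above interleaves NUM_HASH_BLOCKS lanes in a single
// iteration loop; the hard-coded specializations that follow cover the small
// hash factors. A caller-side dispatch might look like this (entirely
// hypothetical names; the real selection lives elsewhere in the miner):
#include <cstddef>
#include <cstdint>

struct cryptonight_ctx;

template <std::size_t NUM_HASH_BLOCKS>
struct MultiHashSketch {
    static void hashHeavy(const uint8_t* in, std::size_t size,
                          uint8_t* out, cryptonight_ctx* ctx);
};

static void hash_heavy_by_factor(std::size_t factor, const uint8_t* in,
                                 std::size_t size, uint8_t* out,
                                 cryptonight_ctx* ctx)
{
    switch (factor) {
    case 1:  MultiHashSketch<1>::hashHeavy(in, size, out, ctx); break;
    case 2:  MultiHashSketch<2>::hashHeavy(in, size, out, ctx); break;
    default: MultiHashSketch<3>::hashHeavy(in, size, out, ctx); break;
    }
}
// ---------------------------------------------------------------------------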
size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, ctx->state[0], 200); - uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(ctx->state[0]) + 24)); + l = ctx->memory; + h = reinterpret_cast(ctx->state[0]); - cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); - al = h[0] ^ h[4]; - ah = h[1] ^ h[5]; - bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); - idx = h[0] ^ h[4]; + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx; + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); - } else { - cx = _mm_load_si128((__m128i*) &l[idx & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); - } + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); + } - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); - const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - idx = EXTRACT64(cx); - bx = cx; + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + idx = EXTRACT64(cx); + bx = cx; - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[idx & MASK])[0]; - ch = ((uint64_t*) &l[idx & MASK])[1]; - lo = __umul128(idx, cl, &hi); + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); - al += hi; - ah += lo; + al += hi; + ah += lo; - ah ^= tweak1_2; - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; - ah ^= tweak1_2; + ah ^= tweak1_2; + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + ah ^= tweak1_2; - ah ^= ch; - al ^= cl; - idx = al; - } + ah ^= ch; + al ^= cl; + idx = al; + } - cn_implode_scratchpad((__m128i*) l, (__m128i*) h); - keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - } + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; + + keccak(static_cast(input), (int) size, ctx->state[0], 200); + + l = ctx->memory; + h = reinterpret_cast(ctx->state[0]); + + cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); + + al = 
h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); + } + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + idx = EXTRACT64(cx); + bx = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); + + al += hi; + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + int64_t n = ((int64_t*)&l[idx & MASK])[0]; + int32_t d = ((int32_t*)&l[idx & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[idx & MASK])[0] = n ^ q; + idx = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + } }; template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); @@ -652,13 +993,13 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); @@ -760,8 +1101,114 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + } + + inline static void hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); + keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], 
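// --- Editor's illustrative sketch (assumes a GCC/Clang-style compiler) -----
// The `lo = __umul128(idx, cl, &hi)` step above is a full 64x64 -> 128-bit
// multiply. MSVC exposes it as the _umul128 intrinsic; elsewhere it is
// typically built on unsigned __int128, e.g.:
#include <cstdint>

static inline uint64_t umul128_sketch(uint64_t a, uint64_t b, uint64_t* hi)
{
    unsigned __int128 r = static_cast<unsigned __int128>(a) * b;
    *hi = static_cast<uint64_t>(r >> 64);
    return static_cast<uint64_t>(r);
}
// ---------------------------------------------------------------------------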
_mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + } + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); } }; @@ -769,9 +1216,9 @@ template class CryptoNightMultiHash { public: - inline static void hash(const void* __restrict__ input, + inline static void hash(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); @@ -890,14 +1337,14 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); } - inline static void hashPowV2(const void* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - void* __restrict__ output, + uint8_t* __restrict__ output, cryptonight_ctx* __restrict__ ctx) { keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); @@ -1040,19 +1487,166 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast(output)); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast(output) + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast(output) + 64); + extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); } + + inline static void 
hashHeavy(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + cryptonight_ctx* __restrict__ ctx) + { + keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); + keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + + const uint8_t* l0 = ctx->memory; + const uint8_t* l1 = ctx->memory + MEM; + const uint8_t* l2 = ctx->memory + 2 * MEM; + uint64_t* h0 = reinterpret_cast(ctx->state[0]); + uint64_t* h1 = reinterpret_cast(ctx->state[1]); + uint64_t* h2 = reinterpret_cast(ctx->state[2]); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + n = ((int64_t*)&l2[idx2 & 
 template<size_t ITERATIONS, size_t MEM, size_t MASK, bool SOFT_AES>
 class CryptoNightMultiHash<ITERATIONS, MEM, MASK, SOFT_AES, 4>
 {
 public:
-    inline static void hash(const void* __restrict__ input,
+    inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            void* __restrict__ output,
+                            uint8_t* __restrict__ output,
                             cryptonight_ctx* __restrict__ ctx)
     {
         keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
@@ -1203,15 +1797,15 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast<uint8_t*>(output));
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast<uint8_t*>(output) + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast<uint8_t*>(output) + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast<uint8_t*>(output) + 96);
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
     }
 
-    inline static void hashPowV2(const void* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 void* __restrict__ output,
+                                 uint8_t* __restrict__ output,
                                  cryptonight_ctx* __restrict__ ctx)
     {
         keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
@@ -1393,20 +1987,207 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast<uint8_t*>(output));
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast<uint8_t*>(output) + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast<uint8_t*>(output) + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast<uint8_t*>(output) + 96);
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
     }
+
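Worth spelling out is the buffer contract all of these N-way entry points share: input holds N job blobs of size bytes back to back (each keccak call seeds from input + k * size), ctx->memory holds N scratchpads of MEM bytes, and block k writes its 32-byte digest at output + k * 32. A caller-side sketch for the 4-way case (blob length and template arguments are illustrative, not taken from the patch):

    // Hypothetical 4-way invocation; ctx->memory must hold 4 * MEM bytes.
    constexpr size_t N = 4, BLOB = 76;
    uint8_t input[N * BLOB];    // blob k lives at input + k * BLOB
    uint8_t output[N * 32];     // digest k lands at output + k * 32

    CryptoNightMultiHash<ITERATIONS, MEM, MASK, false, N>::hashHeavy(input, BLOB, output, ctx);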
+    inline static void hashHeavy(const uint8_t* __restrict__ input,
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 cryptonight_ctx* __restrict__ ctx)
+    {
+        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
+        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
+
+        const uint8_t* l0 = ctx->memory;
+        const uint8_t* l1 = ctx->memory + MEM;
+        const uint8_t* l2 = ctx->memory + 2 * MEM;
+        const uint8_t* l3 = ctx->memory + 3 * MEM;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+
+        cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2);
+        cn_explode_scratchpad_heavy((__m128i*) h3, (__m128i*) l3);
+
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t al3 = h3[0] ^h3[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+        uint64_t ah3 = h3[1] ^h3[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx3 = h3[0] ^h3[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
+            __m128i cx3;
+
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+            }
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
+
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+            idx3 = EXTRACT64(cx3);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+            bx3 = cx3;
+
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);
+
+            al0 += hi;
+            ah0 += lo;
+
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+
+            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t q = n / (d | 0x5);
+
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            idx0 = d ^ q;
+
+
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            n = ((int64_t*)&l1[idx1 & MASK])[0];
+            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+            idx1 = d ^ q;
+
+
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+
+            n = ((int64_t*)&l2[idx2 & MASK])[0];
+            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
+            idx2 = d ^ q;
+
+
+            cl = ((uint64_t*) &l3[idx3 & MASK])[0];
+            ch = ((uint64_t*) &l3[idx3 & MASK])[1];
+            lo = __umul128(idx3, cl, &hi);
+
+            al3 += hi;
+            ah3 += lo;
+
+            ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
+            ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
+
+            ah3 ^= ch;
+            al3 ^= cl;
+            idx3 = al3;
+
+            n = ((int64_t*)&l3[idx3 & MASK])[0];
+            d = ((int32_t*)&l3[idx3 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l3[idx3 & MASK])[0] = n ^ q;
+            idx3 = d ^ q;
+        }
+
+        cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2);
+        cn_implode_scratchpad_heavy((__m128i*) l3, (__m128i*) h3);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        keccakf(h3, 24);
+
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+    }
 };
 
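Note also what the SOFT_AES template flag buys in these bodies: because it is a compile-time constant, the if (SOFT_AES) branch is resolved during instantiation, so the binary carries one copy of each specialization using hardware _mm_aesenc_si128 and one using the table-based soft_aesenc, with no per-iteration branch in either. A sketch of the dispatch this enables (cpu_has_aesni is a hypothetical flag, not a symbol from the patch):

    // Pick the instantiation once, outside the hot loop.
    using HwHash   = CryptoNightMultiHash<ITERATIONS, MEM, MASK, false, 4>;
    using SoftHash = CryptoNightMultiHash<ITERATIONS, MEM, MASK, true, 4>;

    auto fn = cpu_has_aesni ? &HwHash::hashHeavy : &SoftHash::hashHeavy;
    fn(input, size, output, ctx);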
 template<size_t ITERATIONS, size_t MEM, size_t MASK, bool SOFT_AES>
 class CryptoNightMultiHash<ITERATIONS, MEM, MASK, SOFT_AES, 5>
 {
 public:
-    inline static void hash(const void* __restrict__ input,
+    inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            void* __restrict__ output,
+                            uint8_t* __restrict__ output,
                             cryptonight_ctx* __restrict__ ctx)
     {
         keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
@@ -1588,16 +2369,16 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast<uint8_t*>(output));
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast<uint8_t*>(output) + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast<uint8_t*>(output) + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast<uint8_t*>(output) + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, static_cast<uint8_t*>(output) + 128);
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
     }
 
-    inline static void hashPowV2(const void* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 void* __restrict__ output,
+                                 uint8_t* __restrict__ output,
                                  cryptonight_ctx* __restrict__ ctx)
     {
         keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
@@ -1818,12 +2599,238 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, static_cast<uint8_t*>(output));
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, static_cast<uint8_t*>(output) + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, static_cast<uint8_t*>(output) + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, static_cast<uint8_t*>(output) + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, static_cast<uint8_t*>(output) + 128);
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
     }
-};
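The finalization in all of these variants is the standard CryptoNight tail: after the scratchpad is imploded back into the Keccak state and keccakf is applied, the low two bits of the first state byte select one of four finalizers (index 0-3: Blake-256, Groestl-256, JH-256, Skein-256 in the reference design), each emitting a 32-byte digest. In isolation:

    // Illustrative: per-block finalizer selection as used above.
    const size_t selector = ctx->state[i][0] & 3;                  // 0..3
    extra_hashes[selector](ctx->state[i], 200, output + 32 * i);   // 32-byte digest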
+    inline static void hashHeavy(const uint8_t* __restrict__ input,
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 cryptonight_ctx* __restrict__ ctx)
+    {
+        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
+        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
+        keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200);
+
+        const uint8_t* l0 = ctx->memory;
+        const uint8_t* l1 = ctx->memory + MEM;
+        const uint8_t* l2 = ctx->memory + 2 * MEM;
+        const uint8_t* l3 = ctx->memory + 3 * MEM;
+        const uint8_t* l4 = ctx->memory + 4 * MEM;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+
+        cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2);
+        cn_explode_scratchpad_heavy((__m128i*) h3, (__m128i*) l3);
+        cn_explode_scratchpad_heavy((__m128i*) h4, (__m128i*) l4);
+
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t al3 = h3[0] ^h3[4];
+        uint64_t al4 = h4[0] ^h4[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+        uint64_t ah3 = h3[1] ^h3[5];
+        uint64_t ah4 = h4[1] ^h4[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+        __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx3 = h3[0] ^h3[4];
+        uint64_t idx4 = h4[0] ^h4[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
+            __m128i cx3;
+            __m128i cx4;
+
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
+                cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+                cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4));
+            }
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
+            _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4));
+
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+            idx3 = EXTRACT64(cx3);
+            idx4 = EXTRACT64(cx4);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+            bx3 = cx3;
+            bx4 = cx4;
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);
+
+            al0 += hi;
+            ah0 += lo;
+
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+
+            int64_t n = ((int64_t*)&l0[idx0 & MASK])[0];
+            int32_t d = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t q = n / (d | 0x5);
+
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            idx0 = d ^ q;
+
+
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            n = ((int64_t*)&l1[idx1 & MASK])[0];
+            d = ((int32_t*)&l1[idx1 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+            idx1 = d ^ q;
+
+
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+
+            n = ((int64_t*)&l2[idx2 & MASK])[0];
+            d = ((int32_t*)&l2[idx2 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
+            idx2 = d ^ q;
+
+
+            cl = ((uint64_t*) &l3[idx3 & MASK])[0];
+            ch = ((uint64_t*) &l3[idx3 & MASK])[1];
+            lo = __umul128(idx3, cl, &hi);
+
+            al3 += hi;
+            ah3 += lo;
+
+            ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
+            ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
+
+            ah3 ^= ch;
+            al3 ^= cl;
+            idx3 = al3;
+
+            n = ((int64_t*)&l3[idx3 & MASK])[0];
+            d = ((int32_t*)&l3[idx3 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l3[idx3 & MASK])[0] = n ^ q;
+            idx3 = d ^ q;
+
+
+            cl = ((uint64_t*) &l4[idx4 & MASK])[0];
+            ch = ((uint64_t*) &l4[idx4 & MASK])[1];
+            lo = __umul128(idx4, cl, &hi);
+
+            al4 += hi;
+            ah4 += lo;
+
+            ((uint64_t*) &l4[idx4 & MASK])[0] = al4;
+            ((uint64_t*) &l4[idx4 & MASK])[1] = ah4;
+
+            ah4 ^= ch;
+            al4 ^= cl;
+            idx4 = al4;
+
+            n = ((int64_t*)&l4[idx4 & MASK])[0];
+            d = ((int32_t*)&l4[idx4 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l4[idx4 & MASK])[0] = n ^ q;
+            idx4 = d ^ q;
+        }
+
+        cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2);
+        cn_implode_scratchpad_heavy((__m128i*) l3, (__m128i*) h3);
+        cn_implode_scratchpad_heavy((__m128i*) l4, (__m128i*) h4);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        keccakf(h3, 24);
+        keccakf(h4, 24);
+
+        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+    }
+
+};
 
 #endif /* __CRYPTONIGHT_X86_H__ */
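The Client.cpp hunk below is the proxy-overrule fix: a variant field sent by the pool or proxy now always rewrites the local PoW setting, instead of doing so only while the miner is still on autodetect. The mapping is -1 -> autodetect, 0 -> PoW v1, 1 -> PoW v2; unknown values leave the setting untouched. Condensed into a helper (hypothetical name, not the patch's code):

    // variantFromProxy: -1 autodetect, 0 v1, 1 v2; anything else is ignored.
    static void applyProxyVariant(int v)
    {
        switch (v) {
        case -1: Options::i()->setForcePowVersion(Options::POW_AUTODETECT); break;
        case  0: Options::i()->setForcePowVersion(Options::POW_V1);        break;
        case  1: Options::i()->setForcePowVersion(Options::POW_V2);        break;
        default: break;
        }
    }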
diff --git a/src/net/Client.cpp b/src/net/Client.cpp
index 4642069f..0b81e8a5 100644
--- a/src/net/Client.cpp
+++ b/src/net/Client.cpp
@@ -209,20 +209,18 @@ bool Client::parseJob(const rapidjson::Value &params, int *code)
 
     if (params.HasMember("variant")) {
         int variantFromProxy = params["variant"].GetInt();
-        if (Options::i()->forcePowVersion() == Options::POW_AUTODETECT) {
-            switch (variantFromProxy) {
-                case -1:
-                    Options::i()->setForcePowVersion(Options::POW_AUTODETECT);
-                    break;
-                case 0:
-                    Options::i()->setForcePowVersion(Options::POW_V1);
-                    break;
-                case 1:
-                    Options::i()->setForcePowVersion(Options::POW_V2);
-                    break;
-                default:
-                    break;
-            }
+        switch (variantFromProxy) {
+            case -1:
+                Options::i()->setForcePowVersion(Options::POW_AUTODETECT);
+                break;
+            case 0:
+                Options::i()->setForcePowVersion(Options::POW_V1);
+                break;
+            case 1:
+                Options::i()->setForcePowVersion(Options::POW_V2);
+                break;
+            default:
+                break;
         }
     }
 
@@ -584,7 +582,10 @@ void Client::reconnect()
 {
 # endif
 
     if (m_failures == -1) {
-        LOG_DEBUG("Client::onConnect -> m_failures == -1");
+        LOG_DEBUG("Client::reconnect -> m_failures == -1");
+        m_failures = 0;
+        m_expire = 0;
+
         return m_listener->onClose(this, -1);
     }
 
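The DonateStrategy changes below add dedicated CN-Heavy donate endpoints (port 8443) and debounce the retry loop: after five consecutive connection failures the strategy stops retrying immediately and instead re-arms a one-shot libuv timer that suspends it a second later. The libuv pattern in isolation (timer and on_suspend are placeholders for the class members used in the hunk):

    // One-shot re-arm: cancel any pending timer, then fire once after 1000 ms.
    // uv_timer_start(handle, cb, timeout_ms, repeat_ms); repeat_ms == 0 means
    // the callback runs a single time.
    uv_timer_stop(&timer);
    uv_timer_start(&timer, on_suspend, 1000, 0);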
diff --git a/src/net/strategies/DonateStrategy.cpp b/src/net/strategies/DonateStrategy.cpp
index 112c78c2..47e8e219 100644
--- a/src/net/strategies/DonateStrategy.cpp
+++ b/src/net/strategies/DonateStrategy.cpp
@@ -23,6 +23,7 @@
  */
 
+#include 
 #include "interfaces/IStrategyListener.h"
 #include "net/Client.h"
 #include "net/Job.h"
@@ -55,20 +56,28 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
     Url *url;
 
 #ifndef XMRIG_NO_TLS
-    if (Options::i()->forcePowVersion() == Options::POW_V1) {
-        url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8080 : 8081, userId, nullptr, true, false, true);
-    } else if (Options::i()->forcePowVersion() == Options::POW_V2) {
-        url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 995 : 993, userId, nullptr, true, false, true);
+    if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_HEAVY) {
+        url = new Url("donate2.graef.in", 8443, userId, nullptr, true, false, true);
     } else {
-        url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8081 : 443, userId, nullptr, true, false, true);
+        if (Options::i()->forcePowVersion() == Options::POW_V1) {
+            url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8080 : 8081, userId, nullptr, true, false, true);
+        } else if (Options::i()->forcePowVersion() == Options::POW_V2) {
+            url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 995 : 993, userId, nullptr, true, false, true);
+        } else {
+            url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8081 : 443, userId, nullptr, true, false, true);
+        }
     }
 #else
-    if (Options::i()->forcePowVersion() == Options::POW_V1) {
-        url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 80 : 443, userId, nullptr, false, false, true);
-    } else if (Options::i()->forcePowVersion() == Options::POW_V2) {
-        url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 995 : 993, userId, nullptr, false, false, true);
+    if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_HEAVY) {
+        url = new Url("donate.graef.in", 8443, userId, nullptr, false, false, true);
     } else {
-        url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8080 : 80, userId, nullptr, false, false, true);
+        if (Options::i()->forcePowVersion() == Options::POW_V1) {
+            url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 80 : 443, userId, nullptr, false, false, true);
+        } else if (Options::i()->forcePowVersion() == Options::POW_V2) {
+            url = new Url("donate.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 995 : 993, userId, nullptr, false, false, true);
+        } else {
+            url = new Url("donate2.graef.in", Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE ? 8080 : 80, userId, nullptr, false, false, true);
+        }
     }
 #endif
 
@@ -113,6 +122,11 @@ void DonateStrategy::tick(uint64_t now)
 
 
 void DonateStrategy::onClose(Client *client, int failures)
 {
+    if (failures == 5) {
+        LOG_ERR("Failed to connect to donate address. Rescheduling.");
+        uv_timer_stop(&m_timer);
+        uv_timer_start(&m_timer, DonateStrategy::onSuspendTimer, 1000, 0);
+    }
 }
 
@@ -166,3 +180,9 @@
 
     strategy->suspend();
 }
+
+void DonateStrategy::onSuspendTimer(uv_timer_t *handle)
+{
+    auto strategy = static_cast<DonateStrategy*>(handle->data);
+    strategy->suspend();
+}
\ No newline at end of file
diff --git a/src/net/strategies/DonateStrategy.h b/src/net/strategies/DonateStrategy.h
index f7ac2919..6269d191 100644
--- a/src/net/strategies/DonateStrategy.h
+++ b/src/net/strategies/DonateStrategy.h
@@ -62,6 +62,7 @@ private:
     void suspend();
 
     static void onTimer(uv_timer_t *handle);
+    static void onSuspendTimer(uv_timer_t *handle);
 
     bool m_active;
     Client *m_client;
diff --git a/src/version.h b/src/version.h
index 738dac3b..3e2f4fdb 100644
--- a/src/version.h
+++ b/src/version.h
@@ -36,14 +36,14 @@
 #define APP_DESC        "XMRigCC CPU miner"
 #define APP_COPYRIGHT   "Copyright (C) 2017- BenDr0id"
 #endif
-#define APP_VERSION     "1.5.5 (based on XMRig 2.5.2)"
+#define APP_VERSION     "1.6.0_cn_heavy_support_v1 (based on XMRig 2.5.2)"
 #define APP_DOMAIN      ""
 #define APP_SITE        "https://github.com/Bendr0id/xmrigCC"
 #define APP_KIND        "cpu"
 
 #define APP_VER_MAJOR  1
-#define APP_VER_MINOR  5
-#define APP_VER_BUILD  5
+#define APP_VER_MINOR  6
+#define APP_VER_BUILD  0
 #define APP_VER_REV    0
 
 #ifndef NDEBUG