diff --git a/CMakeLists.txt b/CMakeLists.txt
index 25828c3e..c4e30ea1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@ option(WITH_HTTPD "HTTP REST API" OFF)
 option(WITH_CC_CLIENT "CC Client" ON)
 option(WITH_CC_SERVER "CC Server" ON)
 option(WITH_TLS "TLS support" ON)
+option(WITH_ASM "ASM optimizations" ON)
 option(BUILD_STATIC "Build static binary" OFF)
 set(Boost_USE_STATIC_RUNTIME ON)
 set(Boost_USE_STATIC_LIBS ON)
@@ -128,7 +129,7 @@ find_package(UV REQUIRED)
 if (WIN32)
     add_definitions(-DBOOST_ALL_NO_LIB)
-endif()
+endif(WIN32)
 
 find_package(Boost 1.63.0 COMPONENTS system REQUIRED)
 
@@ -144,10 +145,10 @@ if (WITH_TLS)
         set(SOURCES_SSL_TLS src/net/BoostTlsConnection.cpp)
     else()
         message(FATAL_ERROR "OpenSSL NOT found: use `-DWITH_TLS=OFF` to build without TLS support")
-    endif()
+    endif(OPENSSL_FOUND)
 else()
     add_definitions(/DXMRIG_NO_TLS)
-endif()
+endif(WITH_TLS)
 
 if (WITH_LIBCPUID)
     add_subdirectory(src/3rdparty/libcpuid)
@@ -162,8 +163,8 @@ else()
         set(SOURCES_CPUID src/Cpu_arm.cpp)
     else()
         set(SOURCES_CPUID src/Cpu_stub.cpp)
-    endif()
-endif()
+    endif(XMRIG_ARM)
+endif(WITH_LIBCPUID)
 
 CHECK_INCLUDE_FILE (syslog.h HAVE_SYSLOG_H)
 if (HAVE_SYSLOG_H)
@@ -179,11 +180,11 @@ if (WITH_HTTPD)
         set(HTTPD_SOURCES src/api/Httpd.h src/api/Httpd.cpp)
     else()
         message(FATAL_ERROR "microhttpd NOT found: use `-DWITH_HTTPD=OFF` to build without http deamon support")
-    endif()
+    endif(MHD_FOUND)
 else()
     add_definitions(/DXMRIG_NO_HTTPD)
     add_definitions(/DXMRIG_NO_API)
-endif()
+endif(WITH_HTTPD)
 
 if (WITH_CC_SERVER)
     find_package(MHD)
@@ -192,7 +193,7 @@ if (WITH_CC_SERVER)
         include_directories(${MHD_INCLUDE_DIRS})
     else()
         message(FATAL_ERROR "microhttpd NOT found: use `-DWITH_CC_SERVER=OFF` to build without CC Server support")
-    endif()
+    endif(MHD_FOUND)
 
     set(SOURCES_CC_SERVER
         src/cc/CCServer.cpp
@@ -201,12 +202,12 @@ if (WITH_CC_SERVER)
         src/cc/Httpd.cpp
         src/cc/XMRigCC.cpp
         )
-endif()
+endif(WITH_CC_SERVER)
 
 if (WITH_CC_CLIENT)
     set(SOURCES_CC_CLIENT src/cc/CCClient.cpp)
-endif()
+endif(WITH_CC_CLIENT)
 
 if (WITH_CC_SERVER OR WITH_CC_CLIENT)
     set(SOURCES_CC_COMMON
@@ -215,11 +216,34 @@ if (WITH_CC_SERVER OR WITH_CC_CLIENT)
         src/cc/GPUInfo.cpp)
 else()
     add_definitions(/DXMRIG_NO_CC)
-endif()
+endif(WITH_CC_SERVER OR WITH_CC_CLIENT)
+
+if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
+        enable_language(ASM_MASM)
+        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.asm")
+        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
+    else()
+        enable_language(ASM)
+
+        if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
+            set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop_win_gcc.S")
+        else()
+            set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
+        endif()
+
+        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
+    endif()
+
+    add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE})
+    set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)
+else()
+    add_definitions(/DXMRIG_NO_ASM)
+endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
 
 if (BUILD_STATIC)
     set(CMAKE_EXE_LINKER_FLAGS " -static")
-endif()
+endif(BUILD_STATIC)
 
 include_directories(src)
 include_directories(src/3rdparty)
@@ -246,12 +270,16 @@ target_link_libraries(xmrigMiner xmrig_common xmrig_os_dependencies xmrig_cpuid
 
 if (WITH_CC_CLIENT)
     target_link_libraries(xmrigMiner xmrig_cc_common)
-endif (WITH_CC_CLIENT)
+endif(WITH_CC_CLIENT)
 
 if (WITH_TLS)
     target_link_libraries(xmrigMiner xmrig_tls ${OPENSSL_LIBRARIES} ${EXTRA_LIBS})
     target_link_libraries(xmrigMiner xmrig_tls ${OPENSSL_LIBRARIES} ${EXTRA_LIBS})
-endif (WITH_TLS)
+endif(WITH_TLS)
+
+if (WITH_ASM)
+    target_link_libraries(xmrigMiner xmrig_asm)
+endif(WITH_ASM)
 
 add_executable(xmrigDaemon src/cc/XMRigd.cpp res/app.rc)
 set_target_properties(xmrigDaemon PROPERTIES OUTPUT_NAME ${DAEMON_EXECUTABLE_NAME})
@@ -269,6 +297,6 @@ if (WITH_CC_SERVER AND MHD_FOUND)
     set_target_properties(xmrig_common_cc PROPERTIES COMPILE_FLAGS "-DXMRIG_CC_SERVER ${SHARED_FLAGS}")
     set_target_properties(xmrigCCServer PROPERTIES COMPILE_FLAGS "-DXMRIG_CC_SERVER ${SHARED_FLAGS}")
-endif()
+endif(WITH_CC_SERVER AND MHD_FOUND)
 
 add_subdirectory(test EXCLUDE_FROM_ALL)
diff --git a/appveyor.yml b/appveyor.yml
index 80ff01d3..5cfcf8e9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,7 +16,7 @@ clone_folder: c:\xmrigCC
 
 install:
   - mkdir c:\xmrigCC-deps
-  - curl -sL https://github.com/Bendr0id/xmrigCC-deps/releases/download/v2/xmrigCC-deps.zip -o xmrigCC-deps.zip
+  - curl -sL https://github.com/Bendr0id/xmrigCC-deps/releases/download/v3/xmrigCC-deps.zip -o xmrigCC-deps.zip
   - 7z x xmrigCC-deps.zip -o"c:\xmrigCC-deps" -y > nul
 
 build_script:
diff --git a/src/AsmOptimization.h b/src/AsmOptimization.h
new file mode 100644
index 00000000..0662d305
--- /dev/null
+++ b/src/AsmOptimization.h
@@ -0,0 +1,89 @@
+/* XMRigCC
+ * Copyright 2018- BenDr0id
+ *
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_OPTIMIZATION_H__
+#define __ASM_OPTIMIZATION_H__
+
+#include
+#include
+
+enum AsmOptimization
+{
+    ASM_AUTODETECT,
+    ASM_INTEL,
+    ASM_RYZEN,
+    ASM_NONE
+};
+
+inline std::string getAsmOptimizationName(AsmOptimization asmOptimization)
+{
+    switch (asmOptimization)
+    {
+        case ASM_INTEL:
+            return "INTEL";
+        case ASM_RYZEN:
+            return "RYZEN";
+        case ASM_NONE:
+            return "OFF";
+        case ASM_AUTODETECT:
+        default:
+            return "-1";
+    }
+}
+
+inline AsmOptimization parseAsmOptimization(int optimization)
+{
+    AsmOptimization asmOptimization = AsmOptimization::ASM_AUTODETECT;
+
+    switch (optimization) {
+        case -1:
+            asmOptimization = AsmOptimization::ASM_AUTODETECT;
+            break;
+        case 0:
+            asmOptimization = AsmOptimization::ASM_NONE;
+            break;
+        case 1:
+            asmOptimization = AsmOptimization::ASM_INTEL;
+            break;
+        case 2:
+            asmOptimization = AsmOptimization::ASM_RYZEN;
+            break;
+        default:
+            break;
+    }
+
+    return asmOptimization;
+}
+
+inline AsmOptimization parseAsmOptimization(const std::string optimization)
+{
+    AsmOptimization asmOptimization = AsmOptimization::ASM_AUTODETECT;
+
+    if (optimization == "0" || optimization == "none" || optimization == "off") {
+        asmOptimization = AsmOptimization::ASM_NONE;
+    } else if (optimization == "1" || optimization == "intel") {
+        asmOptimization = AsmOptimization::ASM_INTEL;
+    } else if (optimization == "2" || optimization == "ryzen") {
+        asmOptimization = AsmOptimization::ASM_RYZEN;
+    }
+
+    return asmOptimization;
+}
+
+
+#endif /* __ASM_OPTIMIZATION_H__ */
diff --git a/src/Cpu.cpp b/src/Cpu.cpp
index 73fcdfb4..d5da6949 100644
--- a/src/Cpu.cpp
+++ b/src/Cpu.cpp
@@ -48,6 +48,7 @@ CpuImpl::CpuImpl()
     , m_sockets(1)
     , m_totalCores(0)
     , m_totalThreads(0)
+    , m_asmOptimization(AsmOptimization::ASM_NONE)
 {
 }
 
@@ -86,9 +87,9 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
     if (threadsCount > maximumReasonableThreadCount) {
         threadsCount = maximumReasonableThreadCount;
     }
-    if (hashFactor > maximumReasonableFactor / threadsCount) {
+    if (threadsCount > 0 && hashFactor > maximumReasonableFactor / threadsCount) {
         hashFactor = std::min(maximumReasonableFactor / threadsCount, maximumReasonableHashFactor);
-        hashFactor = std::max(hashFactor, static_cast(1));
+        hashFactor = std::max(hashFactor, static_cast(1));
     }
 }
 
@@ -106,9 +107,10 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
         }
         threadsCount = std::max(threadsCount, static_cast(1));
     }
+
     if (hashFactor == 0) {
         hashFactor = std::min(maximumReasonableHashFactor, maximumReasonableFactor / threadsCount);
-        hashFactor = std::max(hashFactor, static_cast(1));
+        hashFactor = std::max(hashFactor, static_cast(1));
     }
 }
 
@@ -215,3 +217,8 @@ int Cpu::getAssignedCpuId(size_t threadId, int64_t affinityMask)
 
     return cpuId;
 }
+
+AsmOptimization Cpu::asmOptimization()
+{
+    return CpuImpl::instance().asmOptimization();
+}
diff --git a/src/Cpu.h b/src/Cpu.h
index a9161d67..4f8821d8 100644
--- a/src/Cpu.h
+++ b/src/Cpu.h
@@ -54,6 +54,7 @@ public:
     static size_t threads();
     static size_t availableCache();
     static int getAssignedCpuId(size_t threadId, int64_t affinityMask);
+    static AsmOptimization asmOptimization();
 };
 
diff --git a/src/CpuImpl.h b/src/CpuImpl.h
index b2bec265..56288f9a 100644
--- a/src/CpuImpl.h
+++ b/src/CpuImpl.h
@@ -51,6 +51,7 @@ public:
     size_t sockets() { return m_sockets; }
     size_t threads() { return m_totalThreads; }
     size_t availableCache();
+    AsmOptimization asmOptimization() { return m_asmOptimization; }
 
 private:
     void initCommon();
 
@@ -63,6 +64,7 @@ private:
size_t m_sockets; size_t m_totalCores; size_t m_totalThreads; + AsmOptimization m_asmOptimization; }; #endif /* __CPU_IMPL_H__ */ diff --git a/src/Cpu_cpuid.cpp b/src/Cpu_cpuid.cpp index 6251a97e..b701a994 100644 --- a/src/Cpu_cpuid.cpp +++ b/src/Cpu_cpuid.cpp @@ -80,4 +80,15 @@ void CpuImpl::initCommon() if (data.flags[CPU_FEATURE_BMI2]) { m_flags |= Cpu::BMI2; } + +# ifndef XMRIG_NO_ASM + if (data.vendor == VENDOR_AMD && data.ext_family >= 0x17) { + m_asmOptimization = AsmOptimization::ASM_RYZEN; + } else if (data.vendor == VENDOR_INTEL && + ((data.ext_family >= 0x06 && data.ext_model > 0x2) || + (data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) { + m_asmOptimization = AsmOptimization::ASM_INTEL; + } +# endif + } diff --git a/src/Options.cpp b/src/Options.cpp index 6a69fade..dd373093 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -73,8 +73,9 @@ Options:\n" -k, --keepalive send keepalived for prevent timeout (need pool support)\n\ -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\ - --pow-variant=V specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), ipbc (tube), alloy, xtl (including autodetect for v5)\n\ + --pow-variant=V specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka cnv7), 2(v2, aka cnv8), ipbc (tube), alloy, xtl (including autodetect for v5)\n\ for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\ + --asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'none' \n\ --multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\ --multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\ --cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\ @@ -166,7 +167,7 @@ static struct option const options[] = { { "userpass", 1, nullptr, 'O' }, { "version", 0, nullptr, 'V' }, { "use-tls", 0, nullptr, 1015 }, - { "force-pow-version",1, nullptr, 1016 }, + { "multihash-thread-mask", 1, nullptr, 4013 }, { "pow-variant" ,1, nullptr, 1017 }, { "api-port", 1, nullptr, 4000 }, { "api-access-token", 1, nullptr, 4001 }, @@ -189,6 +190,7 @@ static struct option const options[] = { { "daemonized", 0, nullptr, 4011 }, { "doublehash-thread-mask", 1, nullptr, 4013 }, { "multihash-thread-mask", 1, nullptr, 4013 }, + { "asm-optimization", 1, nullptr, 4020 }, { nullptr, 0, nullptr, 0 } }; @@ -217,6 +219,7 @@ static struct option const config_options[] = { { "pow-variant", 1, nullptr, 1017 }, { "doublehash-thread-mask", 1, nullptr, 4013 }, { "multihash-thread-mask", 1, nullptr, 4013 }, + { "asm-optimization", 1, nullptr, 4020 }, { nullptr, 0, nullptr, 0 } }; @@ -282,6 +285,7 @@ constexpr static const char *pow_variant_names[] = { "auto", "0", "1", + "2", "tube", "alloy", "xtl", @@ -290,6 +294,13 @@ constexpr static const char *pow_variant_names[] = { "rto" }; +constexpr static const char *asm_optimization_names[] = { + "auto", + "intel", + "ryzen", + "none" +}; + Options *Options::parse(int argc, char **argv) { auto options = new Options(argc, argv); @@ -342,6 +353,7 @@ Options::Options(int argc, char **argv) : m_algoVariant(AV0_AUTO), m_aesni(AESNI_AUTO), m_powVariant(POW_AUTODETECT), + m_asmOptimization(ASM_AUTODETECT), m_hashFactor(0), m_apiPort(0), m_donateLevel(kDonateLevel), @@ -400,6 +412,10 @@ 
Options::Options(int argc, char **argv) : optimizeAlgorithmConfiguration(); + if (m_asmOptimization == AsmOptimization::ASM_AUTODETECT) { + m_asmOptimization = Cpu::asmOptimization(); + } + for (Url *url : m_pools) { url->applyExceptions(); } @@ -588,6 +604,9 @@ bool Options::parseArg(int key, const char *arg) case 4019: /* --cc-upload-config-on-startup */ return parseBoolean(key, true); + case 4020: /* --asm-optimization */ + return parseAsmOptimization(arg); + case 't': /* --threads */ if (strncmp(arg, "all", 3) == 0) { m_threads = Cpu::threads(); @@ -1015,11 +1034,16 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "monerov7") || !strcmp(powVariant, "aeonv7") || !strcmp(powVariant, "v7"))) { + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "cnv1") || !strcmp(powVariant, "monerov7") || !strcmp(powVariant, "aeonv7") || !strcmp(powVariant, "v7"))) { m_powVariant = POW_V1; break; } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "cnv2") || !strcmp(powVariant, "monerov8") || !strcmp(powVariant, "aeonv8") || !strcmp(powVariant, "v8"))) { + m_powVariant = POW_V2; + break; + } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellite")) { m_powVariant = POW_XTL; break; @@ -1049,6 +1073,25 @@ bool Options::parsePowVariant(const char *powVariant) return true; } + +bool Options::parseAsmOptimization(const char *asmOptimization) +{ + for (size_t i = 0; i < ARRAY_SIZE(pow_variant_names); i++) { + if (pow_variant_names[i] && !strcmp(asmOptimization, asm_optimization_names[i])) { + m_asmOptimization = static_cast(i); + break; + } + + if (i == ARRAY_SIZE(asm_optimization_names) - 1) { + showUsage(1); + return false; + } + } + + return true; +} + + void Options::optimizeAlgorithmConfiguration() { // backwards compatibility for configs still setting algo variant (av) @@ -1123,5 +1166,3 @@ bool Options::parseCCUrl(const char* url) return true; } - - diff --git a/src/Options.h b/src/Options.h index 39d26ffd..31a167fe 100644 --- a/src/Options.h +++ b/src/Options.h @@ -34,6 +34,7 @@ #include "rapidjson/fwd.h" #include "PowVariant.h" +#include "AsmOptimization.h" class Url; struct option; @@ -91,6 +92,7 @@ public: inline const std::vector &pools() const { return m_pools; } inline Algo algo() const { return m_algo; } inline PowVariant powVariant() const { return m_powVariant; } + inline AsmOptimization asmOptimization() const { return m_asmOptimization; } inline bool aesni() const { return m_aesni == AESNI_ON; } inline size_t hashFactor() const { return m_hashFactor; } inline int apiPort() const { return m_apiPort; } @@ -136,6 +138,7 @@ private: bool setAlgo(const char *algo); bool parsePowVariant(const char *powVariant); + bool parseAsmOptimization(const char *arg); void optimizeAlgorithmConfiguration(); @@ -167,6 +170,7 @@ private: AlgoVariant m_algoVariant; AesNi m_aesni; PowVariant m_powVariant; + AsmOptimization m_asmOptimization; size_t m_hashFactor; int m_apiPort; int m_donateLevel; diff --git a/src/PowVariant.h b/src/PowVariant.h index fc20c02a..0bde83d6 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -27,6 +27,7 @@ enum PowVariant POW_AUTODETECT, POW_V0, POW_V1, + POW_V2, POW_TUBE, POW_ALLOY, POW_XTL, @@ -44,6 +45,8 @@ inline std::string getPowVariantName(PowVariant powVariant) return "0"; case POW_V1: return "1"; + case POW_V2: + return "2"; case POW_TUBE: return "tube"; case POW_ALLOY: @@ -88,6 +91,9 @@ inline PowVariant parseVariant(int variant) 
case 1: powVariant = PowVariant::POW_V1; break; + case 2: + powVariant = PowVariant::POW_V2; + break; default: break; } @@ -104,6 +110,8 @@ inline PowVariant parseVariant(const std::string variant) powVariant = PowVariant::POW_V0; } else if (variant == "1") { powVariant = PowVariant::POW_V1; + } else if (variant == "2") { + powVariant = PowVariant::POW_V2; } else if (variant == "ipbc" || variant == "tube" || variant == "bittube") { powVariant = PowVariant::POW_TUBE; } else if (variant == "xao" || variant == "alloy") { diff --git a/src/Summary.cpp b/src/Summary.cpp index cfad1e14..c623d4a0 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -59,17 +59,21 @@ static void print_versions() static void print_cpu() { if (Options::i()->colors()) { - Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU: %s (%d) %sx64 %sAES-NI", + Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU: %s (%d) %sx64 %sAES-NI %sASM-%s", Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "\x1B[01;32m" : "\x1B[01;31m-", - Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-"); + Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-", + Options::i()->asmOptimization() != AsmOptimization::ASM_NONE ? "\x1B[01;32m" : "\x1B[01;31m", + getAsmOptimizationName(Options::i()->asmOptimization()).c_str()); # ifndef XMRIG_NO_LIBCPUID Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0); # endif } else { - Log::i()->text(" * CPU: %s (%d) %sx64 %sAES-NI", Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "" : "-", Cpu::hasAES() ? "" : "-"); + Log::i()->text(" * CPU: %s (%d) %sx64 %sAES-NI ASM-%s", + Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "" : "-", Cpu::hasAES() ? "" : "-", + getAsmOptimizationName(Options::i()->asmOptimization()).c_str()); # ifndef XMRIG_NO_LIBCPUID Log::i()->text(" * CPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0); # endif diff --git a/src/config.json b/src/config.json index f777770c..760cf82a 100644 --- a/src/config.json +++ b/src/config.json @@ -4,8 +4,9 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto + "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations + "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, none "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) "colors": true, // false to disable colored output "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index bf9b0b08..374eca45 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -34,28 +34,64 @@ #include "crypto/CryptoNight_test.h" template 
-static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { +#if defined(XMRIG_ARM) CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_ALLOY) { - CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_XTL) { - CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_MSR) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_RTO) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); - }else { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); - } +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif + } else if (powVersion == PowVariant::POW_V2) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1)) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif +} else if (powVersion == PowVariant::POW_ALLOY) { + CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_XTL) { + CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_MSR) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_RTO) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); +}else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); +} # endif } template -static void 
cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { +#if defined(XMRIG_ARM) CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif + } else if (powVersion == PowVariant::POW_V2) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif } else if (powVersion == PowVariant::POW_ALLOY) { CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_XTL) { @@ -70,7 +106,7 @@ static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, siz } template -static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); @@ -83,7 +119,7 @@ static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, } template -static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_TUBE) { @@ -94,7 +130,7 @@ static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input } template -static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == 
PowVariant::POW_XHV) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); @@ -109,7 +145,7 @@ static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, } template -static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_heavy_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_XHV) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); } @@ -121,7 +157,7 @@ static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* inpu } } -void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); +void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); template void setCryptoNightHashMethods(Options::Algo algo, bool aesni) @@ -163,13 +199,19 @@ void setCryptoNightHashMethods<0>(Options::Algo algo, bool aesni) bool CryptoNight::init(int algo, bool aesni) { + for (int i = 0; i < 256; ++i) + { + const uint64_t index = (((i >> 3) & 6) | (i & 1)) << 1; + variant1_table[i] = i ^ ((0x75310 >> index) & 0x30); + } + setCryptoNightHashMethods(static_cast(algo), aesni); return selfTest(algo); } -void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) +void CryptoNight::hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { - cryptonight_hash_ctx[factor-1](powVersion, input, size, output, scratchPad); + cryptonight_hash_ctx[factor-1](asmOptimization, powVersion, input, size, output, scratchPad); } bool CryptoNight::selfTest(int algo) @@ -206,203 +248,231 @@ bool CryptoNight::selfTest(int algo) bool resultLite = true; bool resultHeavy = true; + AsmOptimization asmOptimization = Options::i()->asmOptimization(); + if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) { // cn-heavy - cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0; #endif // cn-heavy haven - cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, 
test_output_heavy_haven, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 96) == 0; #endif // cn-heavy bittube - cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 96) == 0; #endif + } else if (algo == Options::ALGO_CRYPTONIGHT_LITE) { // cn-lite v0 - cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 160) == 0; #endif // cn-lite v7 tests - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && 
memcmp(output, test_output_v1_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 160) == 0; #endif // cn-lite ibpc tests - cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0; #endif } else { - // cn v0 + // cn v0 aka orignal - cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 128) == 0; #endif #if 
MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 160) == 0; #endif - // cn v7 + // cn v7 aka cnv1 - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 160) == 0; #endif - // cn xtl + // cn v7 + xtl - cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 160) == 0; #endif + + // cn v8 aka cnv2 + + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 64) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](asmOptimization, 
PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 96) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 3 + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 128) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 4 + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 160) == 0; + #endif } for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 753c56fc..1275d669 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -25,9 +25,10 @@ #define __CRYPTONIGHT_H__ -#include -#include +#include +#include +#include "AsmOptimization.h" #include "Options.h" #define MEMORY 2097152 /* 2 MiB */ @@ -38,10 +39,17 @@ #define POW_XLT_V4_INDEX_SHIFT 4 struct ScratchPad { - alignas(16) uint8_t state[208]; // 208 instead of 200 to maintain aligned to 16 byte boundaries + alignas(16) uint8_t state[224]; // 224 instead of 200 to maintain aligned to 16 byte boundaries alignas(16) uint8_t* memory; + + // Additional stuff for asm impl + uint8_t ctx_info[24]; + const void* input; + uint8_t* variant1_table; + const uint32_t* t_fn; }; +alignas(64) static uint8_t variant1_table[256]; class Job; class JobResult; @@ -50,8 +58,9 @@ class CryptoNight { public: static bool init(int algo, bool aesni); + static void hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); - static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); +public: private: static bool selfTest(int algo); diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 377c0002..9df769fb 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -36,6 +36,7 @@ #endif +#include #include #include "crypto/CryptoNight.h" @@ -110,6 +111,44 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i #define EXTRACT64(X) _mm_cvtsi128_si64(X) +# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \ +{ \ + const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \ +} + +# define INTEGER_MATH_V2(idx, cl, cx) \ +{ \ + const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ + cl ^= division_result_xmm##idx ^ (sqrt_result##idx << 32); \ + const uint32_t d = static_cast(cx_0 + (sqrt_result##idx << 1)) | 0x80000001UL; \ + const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + division_result_xmm##idx = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ + const uint64_t sqrt_input = cx_0 + division_result_xmm##idx; \ + sqrt_result##idx = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; \ + const uint64_t s = sqrt_result##idx >> 1; \ + const uint64_t b = sqrt_result##idx & 1; \ + const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result##idx << 32); \ + 
sqrt_result##idx += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ +} + +# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \ +{ \ + const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \ + hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \ +} + + #if defined (__arm64__) || defined (__aarch64__) static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) { @@ -121,23 +160,17 @@ static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) { - // multiplier = ab = a * 2^32 + b - // multiplicand = cd = c * 2^32 + d - // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d uint64_t a = multiplier >> 32; uint64_t b = multiplier & 0xFFFFFFFF; uint64_t c = multiplicand >> 32; uint64_t d = multiplicand & 0xFFFFFFFF; - //uint64_t ac = a * c; uint64_t ad = a * d; - //uint64_t bc = b * c; uint64_t bd = b * d; uint64_t adbc = ad + (b * c); uint64_t adbc_carry = adbc < ad ? 1 : 0; - // multiplier * multiplicand = product_hi * 2^64 + product_lo uint64_t product_lo = bd + (adbc << 32); uint64_t product_lo_carry = product_lo < bd ? 1 : 0; *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; @@ -621,8 +654,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, @@ -643,23 +678,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = 
((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -673,6 +712,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -693,9 +734,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, @@ -712,37 +755,42 @@ public: al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; } for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); al[hashBlock] += hi; @@ -750,14 +798,123 @@ public: ah[hashBlock] ^= tweak1_2[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = 
ah[hashBlock]; + ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; ah[hashBlock] ^= tweak1_2[hashBlock]; ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } + + // multi + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t sqrt_result[NUM_HASH_BLOCKS]; + uint64_t division_result_xmm[NUM_HASH_BLOCKS]; + __m128i bx0[NUM_HASH_BLOCKS]; + __m128i bx1[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, + scratchPad[hashBlock]->state, 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + + division_result_xmm[hashBlock] = h[hashBlock][12]; + sqrt_result[hashBlock] = h[hashBlock][13]; + } + + uint64_t sqrt_result0; + uint64_t division_result_xmm0; + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + if (SOFT_AES) { + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); + } else { + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + + sqrt_result0 = sqrt_result[hashBlock]; + division_result_xmm0 = division_result_xmm[hashBlock]; + + INTEGER_MATH_V2(0, cl, cx[hashBlock]) + + sqrt_result[hashBlock] = sqrt_result0; + 
division_result_xmm[hashBlock] = division_result_xmm0; + + lo = __umul128(idx[hashBlock], cl, &hi); + + SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + bx1[hashBlock] = bx0[hashBlock]; + bx0[hashBlock] = cx[hashBlock]; } } @@ -1271,6 +1428,79 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + // single + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + uint64_t idx; + __m128i bx0; + __m128i bx1; + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); + idx = h[0] ^ h[4]; + + uint64_t division_result_xmm0 = h[12]; + uint64_t sqrt_result0 = h[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + const __m128i ax = _mm_set_epi64x(ah, al); + + __m128i cx; + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*) &l[idx & MASK], ax); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, ax); + } + + SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx) + + lo = __umul128(idx, cl, &hi); + + SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + + al += hi; + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1580,6 +1810,7 @@ public: } }; + template class CryptoNightMultiHash { @@ -1783,6 +2014,128 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + // double + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + __m128i bx01 = _mm_set_epi64x(h1[3] 
^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + __m128i cx0; + __m128i cx1; + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2565,6 +2918,172 @@ public: extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } + // triple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] 
^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 
24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -3617,6 +4136,217 @@ public: extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } + // quadruple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + uint64_t division_result_xmm3 = h3[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, 
ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4265,6 +4995,262 @@ public: extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } + // quintuple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + 
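// Illustrative sketch (not part of the patch itself): the SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax)
// step used in the main loops above reads the three sibling 16-byte chunks of the current
// 64-byte cache line (offsets idx ^ 0x10, idx ^ 0x20, idx ^ 0x30), rotates them one slot and
// adds bx1, bx0 and ax with 64-bit lane-wise adds (as _mm_add_epi64 does). A minimal scalar
// reading, assuming idx is already masked into the scratchpad and each __m128i is viewed as
// two uint64_t lanes (index 0 = low lane); the helper name is hypothetical.
#include <cstdint>

static inline void shuffle_phase_1_sketch(uint8_t* l, uint64_t idx,
                                          const uint64_t bx0[2],
                                          const uint64_t bx1[2],
                                          const uint64_t ax[2])
{
    uint64_t* chunk1 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x10));
    uint64_t* chunk2 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x20));
    uint64_t* chunk3 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x30));

    const uint64_t t1[2] = { chunk1[0], chunk1[1] };
    const uint64_t t2[2] = { chunk2[0], chunk2[1] };
    const uint64_t t3[2] = { chunk3[0], chunk3[1] };

    chunk1[0] = t3[0] + bx1[0]; chunk1[1] = t3[1] + bx1[1];  // slot 0x10 <- chunk3 + bx1
    chunk2[0] = t1[0] + bx0[0]; chunk2[1] = t1[1] + bx0[1];  // slot 0x20 <- chunk1 + bx0
    chunk3[0] = t2[0] + ax[0];  chunk3[1] = t2[1] + ax[1];   // slot 0x30 <- chunk2 + ax
}
// SHUFFLE_PHASE_2 performs the same rotation and adds, except that the chunk at idx ^ 0x10 is
// first XOR-ed with the 128-bit value built from hi and lo, and hi and lo are XOR-ed with the
// two 64-bit halves of the old chunk at idx ^ 0x20 before the multiply result is consumed.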
keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + uint64_t division_result_xmm3 = h3[12]; + uint64_t division_result_xmm4 = h4[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + uint64_t sqrt_result4 = h4[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], ax4); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = 
_mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + + INTEGER_MATH_V2(4, cl, cx4); + + lo = __umul128(idx4, cl, &hi); + + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + + al4 += hi; + ah4 += lo; + + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + + bx14 = bx04; + bx04 = cx4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + 
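// Illustrative scalar reading of the INTEGER_MATH_V2(i, cl, cx) step used in the loops above
// (a sketch for understanding only, not the macro itself). division_result and sqrt_result
// correspond to the per-lane state words seeded from h[12] and h[13]; cx_lo and cx_hi are the
// two 64-bit lanes of cx. int_sqrt_v2() is the square-root helper added by this patch (its
// SSE implementation appears in the CryptoNight_x86.h hunk below); the function name here is
// hypothetical.
#include <cstdint>

static inline void integer_math_v2_sketch(uint64_t& cl, uint64_t cx_lo, uint64_t cx_hi,
                                          uint64_t& division_result, uint64_t& sqrt_result)
{
    // fold the previous division and sqrt results into cl before the 64x64 multiply
    cl ^= division_result ^ (sqrt_result << 32);

    // divisor derived from the low cx lane; OR-ing 0x80000001 keeps it odd and non-trivial
    const uint32_t d = static_cast<uint32_t>((cx_lo + (sqrt_result << 1)) | 0x80000001UL);

    // pack the 32-bit quotient (low half) and remainder (high half) into one state word
    division_result = static_cast<uint32_t>(cx_hi / d) + ((cx_hi % d) << 32);

    // integer square root of the mixed value; the SSE version uses _mm_sqrt_sd under
    // FE_UPWARD rounding plus int_sqrt_v2_fixup() to make the result exact
    sqrt_result = int_sqrt_v2(cx_lo + division_result);
}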
extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 0aee57b3..2952f140 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -26,155 +26,169 @@ #define __CRYPTONIGHT_TEST_H__ const static uint8_t test_input[380] = { - 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, - 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, - 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, - 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, - 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, - 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, - 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, - 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, - 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, - 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02, - 0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF, - 0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB, - 0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29, - 0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8, - 0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E, - 0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26, - 0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF, - 0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8, - 0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2, - 0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01, - 0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2, - 0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17, - 0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5, - 0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38, - 0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04 + 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, + 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, + 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, + 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, + 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, + 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, + 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, + 
0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, + 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, + 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02, + 0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF, + 0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB, + 0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29, + 0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8, + 0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E, + 0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26, + 0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF, + 0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8, + 0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2, + 0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01, + 0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2, + 0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17, + 0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5, + 0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38, + 0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04 }; // CN const static uint8_t test_output_v0[160] = { - 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, - 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, - 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, - 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, - 0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0, - 0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C, - 0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D, - 0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7, - 0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 0x24, 0xA8, 0x97, 0x46, 0x62, - 0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0 + 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, + 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, + 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, + 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, + 0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0, + 0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C, + 0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D, + 0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7, + 0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 
0x24, 0xA8, 0x97, 0x46, 0x62, + 0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0 }; // CN v7 const static uint8_t test_output_v1[160] = { - 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, - 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, - 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, - 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, - 0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98, - 0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C, - 0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE, - 0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74, - 0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B, - 0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24 + 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, + 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, + 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, + 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, + 0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98, + 0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C, + 0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE, + 0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74, + 0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B, + 0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24 }; +// CN V8 +const static uint8_t test_output_v2[160] = { + 0x97, 0x37, 0x82, 0x82, 0xcf, 0x10, 0xe7, 0xad, 0x03, 0x3f, 0x7b, 0x80, 0x74, 0xc4, 0x0e, 0x14, + 0xd0, 0x6e, 0x7f, 0x60, 0x9d, 0xdd, 0xda, 0x78, 0x76, 0x80, 0xb5, 0x8c, 0x05, 0xf4, 0x3d, 0x21, + 0x87, 0x1f, 0xcd, 0x68, 0x23, 0xf6, 0xa8, 0x79, 0xbb, 0x3f, 0x33, 0x95, 0x1c, 0x8e, 0x8e, 0x89, + 0x1d, 0x40, 0x43, 0x88, 0x0b, 0x02, 0xdf, 0xa1, 0xbb, 0x3b, 0xe4, 0x98, 0xb5, 0x0e, 0x75, 0x78, + 0xe6, 0x0d, 0x24, 0x0f, 0x65, 0x85, 0x60, 0x3a, 0x4a, 0xe5, 0x5f, 0x54, 0x9b, 0xc8, 0x79, 0x93, + 0xeb, 0x3d, 0x98, 0x2c, 0xfe, 0x9b, 0xfb, 0x15, 0xb6, 0x88, 0x21, 0x94, 0xb0, 0x05, 0x86, 0x5c, + 0x59, 0x8b, 0x93, 0x7a, 0xda, 0xd2, 0xa2, 0x14, 0xed, 0xb7, 0xc4, 0x5d, 0xa1, 0xef, 0x26, 0xf3, + 0xc7, 0x73, 0x29, 0x4d, 0xf1, 0xc8, 0x2c, 0xe0, 0xd0, 0xe9, 0xed, 0x0c, 0x70, 0x75, 0x05, 0x3e, + 0x5b, 0xf6, 0xa0, 0x6e, 0xea, 0xde, 0x87, 0x0b, 0x06, 0x29, 0x03, 0xbf, 0xb4, 0x85, 0x9d, 0x04, + 0x75, 0x1a, 0xcd, 0x1e, 0xd6, 0xaa, 0x1b, 0x05, 0x24, 0x6a, 0x2c, 0x80, 0x69, 0x68, 0xdc, 0x97 +}; + // CN XTL const static uint8_t test_output_xtl[160] = { - 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, - 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, - 0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90, - 0x76, 0xED, 
0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF, - 0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3, - 0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86, - 0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D, - 0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF, - 0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA, - 0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48 + 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, + 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, + 0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90, + 0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF, + 0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3, + 0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86, + 0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D, + 0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF, + 0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA, + 0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48 }; const static uint8_t test_output_v0_lite[160] = { - 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, - 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, - 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, - 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, - 0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E, - 0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18, - 0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37, - 0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53, - 0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 0x22, 0xC9, - 0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5 + 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, + 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, + 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, + 0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E, + 0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18, + 0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37, + 0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53, + 0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 
0x22, 0xC9, + 0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5 }; // CN-Lite v7 const static uint8_t test_output_v1_lite[160] = { - 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, - 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, - 0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45, - 0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F, - 0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB, - 0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32, - 0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E, - 0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4, - 0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9, - 0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6 + 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, + 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, + 0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45, + 0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F, + 0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB, + 0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32, + 0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E, + 0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4, + 0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9, + 0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6 }; // CN-Lite IPBC const static uint8_t test_output_ipbc_lite[160] = { - 0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5, - 0x33, 0xE3, 0x9F, 0x37, 0xAC, 0xE5, 0xF8, 0xEB, 0x7A, 0xE8, 0x40, 0xEB, 0x5D, 0xB1, 0x35, 0x5F, - 0xB2, 0x47, 0x86, 0xF0, 0x7F, 0x6F, 0x4B, 0x55, 0x3E, 0xA1, 0xBB, 0xE8, 0xA1, 0x75, 0x00, 0x2D, - 0x07, 0x9A, 0x21, 0x0E, 0xBD, 0x06, 0x6A, 0xB0, 0xFD, 0x96, 0x9E, 0xE6, 0xE4, 0x69, 0x67, 0xBB, - 0x88, 0x45, 0x0B, 0x91, 0x0B, 0x7B, 0xCB, 0x21, 0x3C, 0x3C, 0x09, 0x30, 0x07, 0x71, 0x07, 0xD5, - 0xB8, 0x2D, 0x83, 0x09, 0xAF, 0x7E, 0xB2, 0xA8, 0xAC, 0x25, 0xDC, 0x10, 0xF8, 0x63, 0x6A, 0xBC, - 0x73, 0x01, 0x4E, 0xA8, 0x1C, 0xDA, 0x9A, 0x86, 0x17, 0xEC, 0xA8, 0xFB, 0xAA, 0x23, 0x23, 0x17, - 0xE1, 0x32, 0x68, 0x9C, 0x4C, 0xF4, 0x08, 0xED, 0xB0, 0x15, 0xC3, 0xA9, 0x0F, 0xF0, 0xA2, 0x7E, - 0xD9, 0xE4, 0x23, 0xA7, 0x9E, 0x91, 0xD8, 0x73, 0x94, 0xD6, 0x6C, 0x70, 0x9B, 0x8B, 0x72, 0x92, - 0xA3, 0xA4, 0x0A, 0xE2, 0x3C, 0x0A, 0x34, 0x88, 0xA1, 0x6D, 0xFE, 0x02, 0x44, 0x60, 0x7B, 0x3D + 0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5, + 0x33, 0xE3, 0x9F, 0x37, 0xAC, 0xE5, 0xF8, 0xEB, 0x7A, 0xE8, 0x40, 0xEB, 0x5D, 0xB1, 0x35, 0x5F, + 0xB2, 0x47, 0x86, 0xF0, 0x7F, 0x6F, 0x4B, 0x55, 0x3E, 0xA1, 0xBB, 0xE8, 0xA1, 0x75, 0x00, 0x2D, + 0x07, 0x9A, 0x21, 0x0E, 0xBD, 0x06, 0x6A, 0xB0, 0xFD, 0x96, 0x9E, 0xE6, 
0xE4, 0x69, 0x67, 0xBB, + 0x88, 0x45, 0x0B, 0x91, 0x0B, 0x7B, 0xCB, 0x21, 0x3C, 0x3C, 0x09, 0x30, 0x07, 0x71, 0x07, 0xD5, + 0xB8, 0x2D, 0x83, 0x09, 0xAF, 0x7E, 0xB2, 0xA8, 0xAC, 0x25, 0xDC, 0x10, 0xF8, 0x63, 0x6A, 0xBC, + 0x73, 0x01, 0x4E, 0xA8, 0x1C, 0xDA, 0x9A, 0x86, 0x17, 0xEC, 0xA8, 0xFB, 0xAA, 0x23, 0x23, 0x17, + 0xE1, 0x32, 0x68, 0x9C, 0x4C, 0xF4, 0x08, 0xED, 0xB0, 0x15, 0xC3, 0xA9, 0x0F, 0xF0, 0xA2, 0x7E, + 0xD9, 0xE4, 0x23, 0xA7, 0x9E, 0x91, 0xD8, 0x73, 0x94, 0xD6, 0x6C, 0x70, 0x9B, 0x8B, 0x72, 0x92, + 0xA3, 0xA4, 0x0A, 0xE2, 0x3C, 0x0A, 0x34, 0x88, 0xA1, 0x6D, 0xFE, 0x02, 0x44, 0x60, 0x7B, 0x3D }; // CN-Heavy const static uint8_t test_output_heavy[160] = { - 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, - 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, - 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, - 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, - 0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7, - 0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD, - 0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1, - 0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9, - 0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51, - 0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB + 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, + 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, + 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, + 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, + 0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7, + 0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD, + 0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1, + 0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9, + 0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51, + 0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB }; // CN-Heavy Haven const static uint8_t test_output_heavy_haven[96] = { - 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, - 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, - 0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F, - 0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA, - 0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9, - 0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B + 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, + 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, + 0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 
0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F, + 0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA, + 0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9, + 0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B }; // CN-Heavy Tube const static uint8_t test_output_heavy_tube[96] = { - 0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf, - 0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35, - 0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3, - 0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74, - 0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f, - 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb + 0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf, + 0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35, + 0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3, + 0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74, + 0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f, + 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb }; #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 6ac2098d..06b5150c 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -29,6 +29,8 @@ #ifdef __GNUC__ # include +#include + #else # include # define __restrict__ __restrict @@ -37,7 +39,7 @@ #include "crypto/CryptoNight.h" #include "crypto/soft_aes.h" - +#include "AsmOptimization.h" extern "C" { @@ -46,42 +48,35 @@ extern "C" #include "crypto/c_blake256.h" #include "crypto/c_jh.h" #include "crypto/c_skein.h" + +#ifndef XMRIG_NO_ASM + void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0); + void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); +#endif } -static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { - blake256_hash(output, input, len); -} - - -static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { - groestl(input, len * 8, output); -} - - -static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { - jh_hash(32 * 8, input, 8 * len, output); -} - - -static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { - xmr_skein(input, output); -} - - -void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +#ifdef __GNUC__ +#define LIKELY(X) __builtin_expect(X, 1) +#define UNLIKELY(X) __builtin_expect(X, 0) +#else +#define LIKELY(X) X +#define UNLIKELY(X) X +#endif #if defined(__x86_64__) || defined(_M_AMD64) # define EXTRACT64(X) _mm_cvtsi128_si64(X) # ifdef __GNUC__ - static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) { unsigned __int128 r = 
(unsigned __int128) a * (unsigned __int128) b; *hi = r >> 64; return (uint64_t) r; } - # else #define __umul128 _umul128 # endif @@ -120,6 +115,71 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin } #endif +#ifdef _MSC_VER +#else +#endif + +#ifdef _MSC_VER +# define SET_ROUNDING_MODE_UP() _control87(RC_UP, MCW_RC); +#else +# define SET_ROUNDING_MODE_UP() std::fesetround(FE_UPWARD); +#endif + +# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \ +{ \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ +} + +# define INTEGER_MATH_V2(idx, cl, cx) \ +{ \ + const uint64_t cx_ = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm##idx)) ^ (sqrt_result##idx << 32); \ + const uint32_t d_ = (cx_ + (sqrt_result##idx << 1)) | 0x80000001UL; \ + const uint64_t cx1_ = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx1_ / d_) + ((cx1_ % d_) << 32); \ + division_result_xmm##idx = _mm_cvtsi64_si128(static_cast(division_result)); \ + sqrt_result##idx = int_sqrt_v2(cx_ + division_result); \ +} + +# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \ +{ \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ + hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ +} + +static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { + blake256_hash(output, input, len); +} + + +static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { + groestl(input, len * 8, output); +} + + +static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { + jh_hash(32 * 8, input, 8 * len, output); +} + + +static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { + xmr_skein(input, output); +} + + +void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; + // This will shift and xor tmp1 into itself as 4 32-bit vals such as // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) @@ -467,6 +527,37 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou _mm_store_si128(output + 11, xout7); } +static inline void int_sqrt_v2_fixup(uint64_t& r, uint64_t n0) +{ + if (LIKELY(r & 524287)) + { + r >>= 19; + return; + } + + --r; + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); +#if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64)) + _addcarry_u64(_subborrow_u64(0, 
x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif +} + +static inline uint64_t int_sqrt_v2(uint64_t n0) +{ +__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); +x = _mm_sqrt_sd(_mm_setzero_pd(), x); +uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); +int_sqrt_v2_fixup(r, n0); +return r; +} + // n-Loop version. Seems to be little bit slower then the hardcoded one. template class CryptoNightMultiHash @@ -481,8 +572,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -502,22 +595,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -531,6 +629,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -551,9 +651,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -575,26 +677,33 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & 
MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -612,6 +721,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -623,6 +734,133 @@ public: } } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + + // multi + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t sqrt_result[NUM_HASH_BLOCKS]; + __m128i bx0[NUM_HASH_BLOCKS]; + __m128i bx1[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; + __m128i division_result_xmm[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + + division_result_xmm[hashBlock] = 
_mm_cvtsi64_si128(h[hashBlock][12]); + sqrt_result[hashBlock] = h[hashBlock][13]; + } + + SET_ROUNDING_MODE_UP(); + + uint64_t sqrt_result0; + __m128i division_result_xmm0; + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + if (SOFT_AES) { + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); + } else { + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; + + sqrt_result0 = sqrt_result[hashBlock]; + division_result_xmm0 = division_result_xmm[hashBlock]; + + INTEGER_MATH_V2(0, cl, cx[hashBlock]) + + sqrt_result[hashBlock] = sqrt_result0; + division_result_xmm[hashBlock] = division_result_xmm0; + + lo = __umul128(idx[hashBlock], cl, &hi); + + SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + bx1[hashBlock] = bx0[hashBlock]; + bx0[hashBlock] = cx[hashBlock]; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -632,9 +870,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -650,34 +890,40 @@ public: al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], 
h[hashBlock][2] ^ h[hashBlock][6]); idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; } for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -697,6 +943,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -717,8 +965,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -738,22 +988,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - 
_mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -774,6 +1029,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = d ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -781,7 +1038,7 @@ public: cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); + output + hashBlock * 32); } } @@ -794,8 +1051,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -815,22 +1074,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -851,6 +1115,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = (~d) ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -871,9 +1137,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; for (size_t hashBlock = 
0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -903,36 +1170,48 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + } - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const __m128i &key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + _mm_store_si128((__m128i *) k, key); + cx[hashBlock] = _mm_xor_si128(cx[hashBlock], _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i *) x, cx[hashBlock]); - _mm_store_si128((__m128i*)k, key); - cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - _mm_store_si128((__m128i*)x, cx); - - k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; x[0] ^= k[0]; - k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; x[1] ^= k[1]; - k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; x[2] ^= k[2]; - k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; - cx = _mm_load_si128((__m128i*)k); + cx[hashBlock] = _mm_load_si128((__m128i *) k); + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; @@ -960,6 +1239,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; 
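The store just above and the index update that follows are the extra signed-division step that sets the heavy variants apart; some flavours derive the next index from d ^ q, others from (~d) ^ q (both forms appear in this hunk). A scalar sketch of that step is below; the way n and d are read from the 16-byte scratchpad line follows the usual cryptonight-heavy definition and is an assumption here, not something this hunk shows.

#include <cstdint>

// Sketch of the heavy-variant division step (layout of n and d is assumed, see note above).
inline uint64_t heavy_division_step(int64_t* line, bool invert_d)
{
    const int64_t n = line[0];                        // first 8 bytes of the line
    const int32_t d = static_cast<int32_t>(line[1]);  // bytes 8..11 of the line (little-endian)
    const int64_t q = n / (d | 0x5);                  // divisor forced non-zero, as in the loops above

    line[0] = n ^ q;                                  // the "= n ^ q" store above

    // Some flavours use d ^ q for the next index, others (~d) ^ q.
    return static_cast<uint64_t>((invert_d ? ~d : d) ^ q);
}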
idx[hashBlock] = d ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -1105,6 +1386,142 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + const uint8_t*l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (SOFT_AES) { + scratchPad[0]->input = input; + scratchPad[0]->variant1_table = variant1_table; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + scratchPad[0]->input = input; + scratchPad[0]->variant1_table = variant1_table; + cnv1_mainloop_sandybridge_asm(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + // single + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + const uint8_t*l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + uint64_t al = h[0] ^ h[4]; + uint64_t ah = h[1] ^ h[5]; + __m128i bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + __m128i bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); + + uint64_t idx = h[0] ^ h[4]; + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h[12]); + uint64_t sqrt_result0 = h[13]; + + SET_ROUNDING_MODE_UP(); + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + const __m128i ax = _mm_set_epi64x(ah, al); + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], ax); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, ax); + } + + SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + + idx = EXTRACT64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx) + + lo = __umul128(idx, cl, &hi); + + SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + + al += hi; // two fence statements are overhead + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + + // single asm + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + const uint8_t* l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (asmOptimization == AsmOptimization::ASM_INTEL) { + if (SOFT_AES) { + scratchPad[0]->input = input; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + 
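The *_asm wrappers in this hunk keep keccak, the scratchpad explode/implode and the final extra hash in C++ and hand the prepared ScratchPad to an external assembly main loop (the cn_main_loop.* sources added to the build). Their prototypes are not part of this diff; below is a minimal sketch of what the C++ side presumably declares, with ScratchPad reduced to the members these wrappers actually touch (the real struct and signatures may differ).

#include <cstdint>

// Hypothetical declarations; the real ones live elsewhere in the tree.
struct ScratchPad {
    uint8_t*        memory;          // scratchpad memory, exploded from state
    uint8_t         state[200];      // keccak state, input/output of the main loop
    const uint8_t*  input;           // set by the wrappers for the variant-1 loops
    const uint32_t* t_fn;            // soft-AES table, set for the *_soft_aes loops
    const void*     variant1_table;  // variant-1 tweak table, set for the cnv1 loops
};

extern "C" {
    void cnv1_mainloop_sandybridge_asm(ScratchPad* scratchPad);
    void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_ivybridge_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_ryzen_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* scratchPad);
    void cnv2_double_mainloop_sandybridge_asm(ScratchPad* scratchPad0, ScratchPad* scratchPad1);
}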
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + cnv2_mainloop_ivybridge_asm(scratchPad[0]); + } + } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { + cnv2_mainloop_ryzen_asm(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1605,6 +2022,209 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // double + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + __m128i division_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[12]), _mm_cvtsi64_si128(h1[12])); + __m128i sqrt_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[13]), _mm_cvtsi64_si128(h1[13])); + + SET_ROUNDING_MODE_UP() + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + } + + SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + const uint64_t sqrt_result0 = _mm_cvtsi128_si64(sqrt_result_xmm); + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result0 << 32); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi) + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + cl = 
((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + const uint64_t sqrt_result1 = _mm_cvtsi128_si64(_mm_srli_si128(sqrt_result_xmm, 8)); + cl ^= static_cast(_mm_cvtsi128_si64(_mm_srli_si128(division_result_xmm, 8))) ^ (sqrt_result1 << 32); + + const __m128i sqrt_result2 = _mm_add_epi64(_mm_slli_epi64(sqrt_result_xmm, 1), _mm_unpacklo_epi64(cx0, cx1)); + const uint32_t d0 = _mm_cvtsi128_si64(sqrt_result2) | 0x80000001UL; + const uint32_t d1 = _mm_cvtsi128_si64(_mm_srli_si128(sqrt_result2, 8)) | 0x80000001UL; + + const uint64_t cx01 = _mm_cvtsi128_si64(_mm_srli_si128(cx0, 8)); + const uint64_t cx11 = _mm_cvtsi128_si64(_mm_srli_si128(cx1, 8)); + __m128d x = _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), (cx01 + 1) >> 1), _mm_cvtsi64_sd(_mm_setzero_pd(), (cx11 + 1) >> 1)); + __m128d y = _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), d0), _mm_cvtsi64_sd(_mm_setzero_pd(), d1)); + + __m128d result = _mm_div_pd(x, y); + result = _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(result), _mm_set_epi64x(1ULL << 52, 1ULL << 52))); + + uint64_t q0 = _mm_cvttsd_si64(result); + uint64_t q1 = _mm_cvttsd_si64(_mm_castsi128_pd(_mm_srli_si128(_mm_castpd_si128(result), 8))); + + uint64_t r0 = cx01 - d0 * q0; + if (UNLIKELY(int64_t(r0) < 0)) + { + --q0; + r0 += d0; + } + uint64_t r1 = cx11 - d1 * q1; + if (UNLIKELY(int64_t(r1) < 0)) + { + --q1; + r1 += d1; + } + + division_result_xmm = _mm_set_epi32(r1, q1, r0, q0); + + __m128i sqrt_input = _mm_add_epi64(_mm_unpacklo_epi64(cx0, cx1), division_result_xmm); + x = _mm_castsi128_pd(_mm_add_epi64(_mm_srli_epi64(sqrt_input, 12), _mm_set_epi64x(1023ULL << 52, 1023ULL << 52))); + + x = _mm_sqrt_pd(x); + + r0 = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + int_sqrt_v2_fixup(r0, _mm_cvtsi128_si64(sqrt_input)); + r1 = static_cast(_mm_cvtsi128_si64(_mm_srli_si128(_mm_castpd_si128(x), 8))); + int_sqrt_v2_fixup(r1, _mm_cvtsi128_si64(_mm_srli_si128(sqrt_input, 8))); + sqrt_result_xmm = _mm_set_epi64x(r1, r0); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi) + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + // double asm + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + +#ifndef XMRIG_NO_ASM + cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); +#endif + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + 
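The packed-double division and square root in the double hashPowV3 above, together with the r = n - d*q correction and int_sqrt_v2_fixup, compute per lane what INTEGER_MATH_V2 presumably expands to in the scalar paths. A scalar restatement of that step follows; names mirror the diff, and the square root is left as the uncorrected double-precision estimate that the real code then fixes up.

#include <cstdint>
#include <cmath>

struct V2MathState {
    uint64_t division_result = 0;   // low lane of division_result_xmm
    uint64_t sqrt_result     = 0;
};

// cx_lo / cx_hi are the low and high 64-bit halves of the AES output cx.
// Returns the adjusted cl that feeds the 64x64 multiply (__umul128) above.
inline uint64_t integer_math_v2(uint64_t cl, uint64_t cx_lo, uint64_t cx_hi, V2MathState& s)
{
    // Mix the previous iteration's results into cl.
    cl ^= s.division_result ^ (s.sqrt_result << 32);

    // Divisor: low half of cx plus twice the previous sqrt, forced odd and >= 2^31
    // (matches d0/d1 = ... | 0x80000001UL above).
    const uint32_t d = static_cast<uint32_t>((cx_lo + (s.sqrt_result << 1)) | 0x80000001UL);

    // Divide the high half of cx; quotient in the low 32 bits, remainder above it
    // (matches division_result_xmm = _mm_set_epi32(r1, q1, r0, q0) above).
    const uint64_t q = cx_hi / d;
    const uint64_t r = cx_hi % d;
    s.division_result = (r << 32) | static_cast<uint32_t>(q);

    // sqrt_result ~= floor(sqrt(2^64 + sqrt_input) * 2 - 2^33); the real code
    // refines this double-precision estimate with int_sqrt_v2_fixup.
    const uint64_t sqrt_input = cx_lo + s.division_result;
    s.sqrt_result = static_cast<uint64_t>(
        std::sqrt(18446744073709551616.0 + static_cast<double>(sqrt_input)) * 2.0 - 8589934592.0);

    return cl;
}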
+ extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2216,155 +2836,341 @@ public: extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i 
bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - } + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) 
&l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); - al0 += hi; - ah0 += lo; + al0 += hi; + ah0 += lo; - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); - al1 += hi; - ah1 += lo; + al1 += hi; + ah1 += lo; - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); - al2 += hi; - ah2 += lo; + al2 += hi; + ah2 += lo; - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - } + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + } - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - } + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // triple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; 
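For reference, every hashPowV2 lane above repeats the same two variant-1 tweaks: the table transform of byte 11 of the freshly stored line, and a tweak1_2 xor that only affects the value written back for ah (the xor/store/xor pair leaves the register unchanged). Pulled out into stand-alone helpers below; INDEX_SHIFT is a compile-time constant in the real code and is passed explicitly here.

#include <cstdint>
#include <cstring>

// tweak1_2 = (input bytes 35..42) ^ (keccak state word 24), as derived above.
inline uint64_t variant1_tweak(const uint8_t* input, const uint64_t* state)
{
    uint64_t t;
    std::memcpy(&t, input + 35, sizeof(t));
    return t ^ state[24];
}

// Table transform of byte 11 of the 16-byte line that was just stored.
inline void variant1_byte11_tweak(uint8_t* line, unsigned index_shift)
{
    static const uint32_t table = 0x75310;
    const uint8_t tmp   = line[11];
    const uint8_t index = (((tmp >> index_shift) & 6) | (tmp & 1)) << 1;
    line[11] = tmp ^ ((table >> index) & 0x30);
}

// Net effect of "ah ^= tweak; store ah; ah ^= tweak" in the loops above.
inline void variant1_store(uint64_t* line, uint64_t al, uint64_t ah, uint64_t tweak1_2)
{
    line[0] = al;
    line[1] = ah ^ tweak1_2;
}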
+ const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + 
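INTEGER_MATH_V2 takes the lane number as its first argument; the most plausible reading is that the macro pastes that suffix onto the per-lane locals (division_result_xmm0/1/2, sqrt_result0/1/2), which would also explain why the NUM_HASH_BLOCKS version near the top of this hunk copies its array slots into the ..._0 temporaries before invoking the macro. A toy illustration of the pattern (names hypothetical):

#include <cstdint>
#include <cstdio>

// Selects per-lane state by token-pasting the first argument, the way
// INTEGER_MATH_V2(0, ...) presumably resolves to division_result_xmm0 / sqrt_result0.
#define ACCUMULATE(part, value) \
    do { sum_##part += (value); } while (0)

int main()
{
    uint64_t sum_0 = 0, sum_1 = 0;

    ACCUMULATE(0, 10);   // expands to: sum_0 += 10;
    ACCUMULATE(1, 32);   // expands to: sum_1 += 32;

    std::printf("%llu %llu\n",
                static_cast<unsigned long long>(sum_0),
                static_cast<unsigned long long>(sum_1));
    return 0;
}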
SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, @@ -3198,197 +4004,7 @@ public: extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(scratchPad[3]->state) + 24)); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - const uint8_t* l3 = scratchPad[3]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - - if (SOFT_AES) { - 
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - - - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - - al3 += hi; - ah3 += lo; - - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; - - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - - keccakf(h0, 24); - keccakf(h1, 24); - 
keccakf(h2, 24); - keccakf(h3, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); - } - - inline static void hashLiteTube(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) @@ -3493,6 +4109,429 @@ public: bx3 = cx3; + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // quadruple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + 
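Like the other hashPowV3 bodies in this hunk, the quadruple version below calls SET_ROUNDING_MODE_UP() once before its main loop; since the variant-2 division and square root run on SSE doubles, the macro presumably pins the MXCSR rounding mode (to round-up, per its name) so the results stay consistent across the loop. Its definition is not part of this diff; one plausible sketch:

#include <xmmintrin.h>

// Plausible definition only; the real macro lives elsewhere in the tree and may
// also cover the x87/MSVC control word. This flips the SSE (MXCSR) rounding mode,
// which is what the _mm_div_pd / _mm_sqrt_pd math in these loops uses.
#ifndef SET_ROUNDING_MODE_UP
#define SET_ROUNDING_MODE_UP() _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
#endif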
uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + __m128i division_result_xmm3 = _mm_cvtsi64_si128(h3[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) 
&l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = 
reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + + uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; @@ -3806,235 +4845,512 @@ public: extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } - inline static void hashPowV2(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, 
scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ + *(reinterpret_cast(scratchPad[4]->state) + 24)); + + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, 
_mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + bx4 = cx4; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + lo = __umul128(idx4, cl, &hi); + + al4 += hi; + ah4 += lo; + + ah4 ^= tweak1_2_4; + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + ah4 ^= tweak1_2_4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, 
(__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // quintuple + inline static void hashPowV3(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); - keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(scratchPad[3]->state) + 24)); - uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ - *(reinterpret_cast(scratchPad[4]->state) + 24)); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] 
^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + __m128i division_result_xmm3 = _mm_cvtsi64_si128(h3[12]); + __m128i division_result_xmm4 = _mm_cvtsi64_si128(h4[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + uint64_t sqrt_result4 = h4[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], ax4); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + 
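+ // lane 0 (CN variant 2): rotate the b registers, the previous bx00 moves into bx10 and the freshly encrypted cx0 becomes the new bx00, both are consumed by SHUFFLE_PHASE_1/2 on the next iteration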
bx10 = bx00; + bx00 = cx0; - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - const uint8_t* l3 = scratchPad[3]->memory; - const uint8_t* l4 = scratchPad[4]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); - uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + INTEGER_MATH_V2(1, cl, cx1); - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t al4 = h4[0] ^h4[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - uint64_t ah4 = h4[1] ^h4[5]; + lo = __umul128(idx1, cl, &hi); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - uint64_t idx4 = h4[0] ^h4[4]; + al1 += hi; + ah1 += lo; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - __m128i cx4; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); - cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp 
& 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - idx4 = EXTRACT64(cx4); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - bx4 = cx4; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; + bx11 = bx01; + bx01 = cx1; - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - al1 += hi; - ah1 += lo; + INTEGER_MATH_V2(2, cl, cx2); - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; + lo = __umul128(idx2, cl, &hi); - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - al2 += hi; - ah2 += lo; + INTEGER_MATH_V2(3, cl, cx3); - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; + lo = __umul128(idx3, cl, &hi); - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; - al3 += hi; - ah3 += lo; + INTEGER_MATH_V2(4, cl, cx4); - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; + lo = __umul128(idx4, cl, &hi); - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + al4 += hi; + ah4 += lo; - cl = ((uint64_t*) &l4[idx4 & MASK])[0]; - ch = ((uint64_t*) &l4[idx4 & MASK])[1]; - lo = __umul128(idx4, cl, &hi); + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; - al4 += hi; - ah4 += lo; 
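+ // lane 4 (CN variant 2): fold the loaded quadwords into the accumulator (ah4 ^= ch, al4 ^= cl); the low half then selects the next scratchpad offset via idx4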
+ ah4 ^= ch; + al4 ^= cl; + idx4 = al4; - ah4 ^= tweak1_2_4; - ((uint64_t*) &l4[idx4 & MASK])[0] = al4; - ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; - ah4 ^= tweak1_2_4; + bx14 = bx04; + bx04 = cx4; + } - ah4 ^= ch; - al4 ^= cl; - idx4 = al4; - } + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); - keccakf(h4, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); - extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); - } + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S new file mode 100644 index 00000000..555f3fb1 --- /dev/null +++ b/src/crypto/asm/cn_main_loop.S @@ -0,0 +1,88 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv1_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv1_mainloop_sandybridge.inc" + add rsp, 48 + ret 0 +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include 
"cnv2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv1_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm new file mode 100644 index 00000000..00cf6d09 --- /dev/null +++ b/src/crypto/asm/cn_main_loop.asm @@ -0,0 +1,71 @@ +_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_mainloop_sandybridge_asm +PUBLIC cnv2_mainloop_ivybridge_asm +PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_double_mainloop_sandybridge_asm + +PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm +PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv1_mainloop_sandybridge_asm PROC + INCLUDE cnv1_mainloop_sandybridge.inc + ret 0 +cnv1_mainloop_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_ivybridge_asm PROC + INCLUDE cnv2_main_loop_ivybridge.inc + ret 0 +cnv2_mainloop_ivybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_ryzen_asm PROC + INCLUDE cnv2_main_loop_ryzen.inc + ret 0 +cnv2_mainloop_ryzen_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_double_mainloop_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_sandybridge.inc + ret 0 +cnv2_double_mainloop_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv1_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc + ret 0 +cnv1_mainloop_soft_aes_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc + ret 0 +cnv2_mainloop_soft_aes_sandybridge_asm ENDP + +_TEXT_CN_MAINLOOP ENDS +END diff --git a/src/crypto/asm/cn_main_loop_win_gcc.S b/src/crypto/asm/cn_main_loop_win_gcc.S new file mode 100644 index 00000000..4edb17f8 --- /dev/null +++ b/src/crypto/asm/cn_main_loop_win_gcc.S @@ -0,0 +1,42 @@ +#define ALIGN .align +.intel_syntax noprefix +# define FN_PREFIX(fn) fn +.section .text + +.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) + +ALIGN 64 +FN_PREFIX(cnv1_mainloop_sandybridge_asm): + #include "cnv1_mainloop_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + #include "cnv2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_ryzen_asm): + #include "cnv2_main_loop_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + #include "cnv2_double_main_loop_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): + #include "cnv1_mainloop_soft_aes_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): + #include "cnv2_mainloop_soft_aes_sandybridge.inc" + ret 0 \ No newline at end of file diff --git a/src/crypto/asm/cnv1_mainloop_sandybridge.inc b/src/crypto/asm/cnv1_mainloop_sandybridge.inc new file mode 100644 
index 00000000..89cc15e8 --- /dev/null +++ b/src/crypto/asm/cnv1_mainloop_sandybridge.inc @@ -0,0 +1,74 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r14 + push r15 + mov rax, QWORD PTR [rcx+48] + mov ebp, 524288 + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm3, rax + mov rax, QWORD PTR [rcx+256] + mov rdi, QWORD PTR [rcx+40] + movq xmm0, rdx + xor rdi, QWORD PTR [rcx+8] + mov rdx, r8 + mov r15, QWORD PTR [rcx+264] + and edx, 2097136 + mov r14, QWORD PTR [rax+35] + xor r14, QWORD PTR [rcx+192] + mov rsi, QWORD PTR [rcx+224] + punpcklqdq xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv1_mainloop_sandybridge: + movq xmm0, rdi + movq xmm1, r8 + punpcklqdq xmm1, xmm0 + aesenc xmm2, xmm1 + movq r10, xmm2 + mov r9d, r10d + and r9d, 2097136 + add r9, rsi + movdqa xmm0, xmm2 + pxor xmm0, xmm3 + movdqa xmm3, xmm2 + movdqu XMMWORD PTR [rdx+rsi], xmm0 + psrldq xmm0, 11 + movq rax, xmm0 + movzx eax, al + movzx eax, BYTE PTR [rax+r15] + mov BYTE PTR [rsi+rdx+11], al + mov rbx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + mul r10 + add r8, rdx + mov QWORD PTR [r9], r8 + add rdi, rax + mov rax, r14 + xor rax, rdi + mov QWORD PTR [r9+8], rax + xor r8, rbx + mov rdx, r8 + and edx, 2097136 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + xor rdi, r11 + dec ebp + jne cnv1_mainloop_sandybridge + + mov rbx, QWORD PTR [rsp+24] + mov rbp, QWORD PTR [rsp+32] + mov rsi, QWORD PTR [rsp+40] + mov rdi, QWORD PTR [rsp+48] + pop r15 + pop r14 diff --git a/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..5a28185e --- /dev/null +++ b/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,166 @@ + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 72 + + movaps XMMWORD PTR [rsp], xmm6 + movaps XMMWORD PTR [rsp+16], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + movaps XMMWORD PTR [rsp+48], xmm9 + + mov rax, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm4, rax + mov rax, QWORD PTR [rcx+256] + mov r13, QWORD PTR [rcx+40] + movq xmm0, rdx + xor r13, QWORD PTR [rcx+8] + mov rdx, r8 + mov rdi, QWORD PTR [rcx+224] + and edx, 2097136 + mov rax, QWORD PTR [rax+35] + xor rax, QWORD PTR [rcx+192] + movq xmm5, rax + movq xmm8, rdi + punpcklqdq xmm4, xmm0 + mov QWORD PTR [rsp+64], rdx + + movq xmm6, rcx + mov rax, QWORD PTR [rcx+264] + movq xmm7, rax + + mov eax, 524288 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv1_mainloop_soft_aes_sandybridge: + movq xmm9, rax + mov r12, QWORD PTR [rcx+272] + mov esi, DWORD PTR [rdx+rdi] + mov r10d, DWORD PTR [rdx+rdi+4] + mov ebp, DWORD PTR [rdx+rdi+12] + mov r14d, DWORD PTR [rdx+rdi+8] + mov rdx, QWORD PTR [rsp+64] + movzx ecx, sil + shr esi, 8 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 
+ movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + add ebp, 256 + movd xmm1, r11d + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movq rdi, xmm8 + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + punpckldq xmm2, xmm1 + movq xmm1, r8 + xor eax, DWORD PTR [r12+rcx*4] + xor eax, r15d + movd xmm3, eax + movq rax, xmm7 + punpckldq xmm3, xmm0 + movq xmm0, r13 + punpcklqdq xmm1, xmm0 + punpckldq xmm3, xmm2 + pxor xmm3, xmm1 + movq r9, xmm3 + mov r10d, r9d + and r10d, 2097136 + movdqa xmm0, xmm3 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx+rdi], xmm0 + psrldq xmm0, 11 + movq rcx, xmm0 + movzx ecx, cl + mov cl, BYTE PTR [rcx+rax] + mov BYTE PTR [rdi+rdx+11], cl + mov rbx, QWORD PTR [r10+rdi] + mov rcx, r9 + lea r9, QWORD PTR [r10+rdi] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + movdqa xmm4, xmm3 + mul rcx + movq rcx, xmm6 + add r8, rdx + add r13, rax + movq rax, xmm5 + xor rax, r13 + mov QWORD PTR [r9], r8 + xor r8, rbx + mov QWORD PTR [r9+8], rax + movq rax, xmm9 + mov rdx, r8 + xor r13, r11 + and edx, 2097136 + mov QWORD PTR [rsp+64], rdx + sub eax, 1 + jne cnv1_mainloop_soft_aes_sandybridge + + movaps xmm6, XMMWORD PTR [rsp] + movaps xmm7, XMMWORD PTR [rsp+16] + movaps xmm8, XMMWORD PTR [rsp+32] + movaps xmm9, XMMWORD PTR [rsp+48] + + add rsp, 72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..1ea871f3 --- /dev/null +++ b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc @@ -0,0 +1,414 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + 
movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq 
xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git 
a/src/crypto/asm/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/cnv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..35ee0627 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp 
+ jmp $cnv2_main_loop_ivybridge_endp + +$sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_ivybridge_ret + +$cnv2_main_loop_ivybridge_endp: diff --git a/src/crypto/asm/cnv2_main_loop_ryzen.inc b/src/crypto/asm/cnv2_main_loop_ryzen.inc new file mode 100644 index 00000000..42054413 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_ryzen.inc @@ -0,0 +1,183 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_ryzen + shr rdi, 19 + +$sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 
2097136 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_ryzen_endp + +$sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_ryzen_ret + +$cnv2_main_loop_ryzen_endp: diff --git a/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..bc3da761 --- /dev/null +++ b/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,271 @@ + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 524288 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv2_mainloop_soft_aes_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd 
xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + je sqrt_fixup_soft_aes_sandybridge + psrlq xmm1, 19 +sqrt_fixup_soft_aes_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp + +sqrt_fixup_soft_aes_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_sandybridge_ret + +cnv2_mainloop_soft_aes_sandybridge_asm_endp: diff --git a/src/default_config.json b/src/default_config.json index d3081d1b..ec5fcecb 100644 --- a/src/default_config.json +++ b/src/default_config.json @@ -4,8 +4,9 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process 
at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto + "pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations + "asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, none "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) "colors": true, // false to disable colored output "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 diff --git a/src/interfaces/ILogBackend.h b/src/interfaces/ILogBackend.h index 458b504c..31d264de 100644 --- a/src/interfaces/ILogBackend.h +++ b/src/interfaces/ILogBackend.h @@ -31,6 +31,12 @@ class ILogBackend { public: +# ifdef APP_DEBUG + constexpr static const size_t kBufferSize = 1024; +# else + constexpr static const size_t kBufferSize = 512; +# endif + virtual ~ILogBackend() {} virtual void message(int level, const char* fmt, va_list args) = 0; diff --git a/src/log/FileLog.cpp b/src/log/FileLog.cpp index 7d04e574..8450a51b 100644 --- a/src/log/FileLog.cpp +++ b/src/log/FileLog.cpp @@ -56,19 +56,20 @@ void FileLog::message(int level, const char* fmt, va_list args) localtime_r(&now, &stime); # endif - auto *buf = new char[512]; - int size = snprintf(buf, 23, "[%d-%02d-%02d %02d:%02d:%02d] ", - stime.tm_year + 1900, - stime.tm_mon + 1, - stime.tm_mday, - stime.tm_hour, - stime.tm_min, - stime.tm_sec); + snprintf(m_fmt, sizeof(m_fmt) - 1, "[%d-%02d-%02d %02d:%02d:%02d] %s", + stime.tm_year + 1900, + stime.tm_mon + 1, + stime.tm_mday, + stime.tm_hour, + stime.tm_min, + stime.tm_sec, + fmt); - size = vsnprintf(buf + size, 512 - size - 1, fmt, args) + size; + auto *buf = new char[kBufferSize]; + const int size = vsnprintf(buf, kBufferSize - 1, m_fmt, args); buf[size] = '\n'; - std::string row = std::regex_replace(std::string(buf, size+1), std::regex("\x1B\\[[0-9;]*[a-zA-Z]"), ""); + std::string row = std::regex_replace(std::string(buf, static_cast<size_t>(size + 1)), std::regex("\x1B\\[[0-9;]*[a-zA-Z]"), ""); memcpy(buf, row.c_str(), row.length()); diff --git a/src/log/FileLog.h b/src/log/FileLog.h index 2b3ca5d4..469347d4 100644 --- a/src/log/FileLog.h +++ b/src/log/FileLog.h @@ -44,6 +44,7 @@ private: void write(char *data, size_t size); + char m_fmt[256]; int m_file; }; diff --git a/src/net/Job.cpp b/src/net/Job.cpp index 8de5034d..c9253489 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -138,24 +138,24 @@ bool Job::setTarget(const char *target) PowVariant Job::powVariant() const { - if (m_powVariant == PowVariant::POW_AUTODETECT) - { - return (m_blob[0] > 6 ?
PowVariant::POW_V1 : PowVariant::POW_V0); + if (m_powVariant == PowVariant::POW_AUTODETECT) { + if (m_blob[0] > 7) { + return PowVariant::POW_V2; + } else if (m_blob[0] > 6) { + return PowVariant::POW_V1; + } else { + return PowVariant::POW_V0; + } } - else if (m_powVariant == PowVariant::POW_XTL && m_blob[0] < 4) - { + else if (m_powVariant == PowVariant::POW_XTL && m_blob[0] < 4) { return POW_V1; } - else if (m_powVariant == PowVariant::POW_MSR && m_blob[0] < 7) - { + else if (m_powVariant == PowVariant::POW_MSR && m_blob[0] < 7) { return POW_V1; } - else if (m_powVariant == PowVariant::POW_XHV && m_blob[0] < 3) - { + else if (m_powVariant == PowVariant::POW_XHV && m_blob[0] < 3) { return POW_V0; - } - else - { + } else { return m_powVariant; } } diff --git a/src/version.h b/src/version.h index f8ce56e2..a8b6849b 100644 --- a/src/version.h +++ b/src/version.h @@ -36,13 +36,13 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.7.0 (based on XMRig)" +#define APP_VERSION "1.8.0_beta1 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 -#define APP_VER_MINOR 7 +#define APP_VER_MINOR 8 #define APP_VER_BUILD 0 #define APP_VER_REV 0 diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index e599b87f..15389fbb 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -140,7 +140,7 @@ void MultiWorker::start() *Job::nonce(m_state->blob + i * m_state->job.size()) = ++m_state->nonces[i]; } - CryptoNight::hash(m_hashFactor, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); + CryptoNight::hash(m_hashFactor, Options::i()->asmOptimization(), m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); for (size_t i=0; i < m_hashFactor; ++i) { if (*reinterpret_cast(m_hash + 24 + i * 32) < m_state->job.target()) {