From 862c34b31eba56b6fb52266428ca0fa16745d248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ben=20Gr=C3=A4f?=
Date: Tue, 5 Mar 2019 23:49:32 +0100
Subject: [PATCH] #1.9.0 (#236)

# 1.9.0
- Integrated Monero CN-R variant, so-called CNv4 (aka CNv5, aka Cryptonight-R) #233 (algo: "cryptonight", variant: "r")
- Integrated Wownero CN-R variant #233 (algo: "cryptonight", variant: "wow")
- Integrated Graft variant (algo: "cryptonight", variant: "rwz" OR variant: "graft")
- Integrated X-Cash variant #234 (algo: "cryptonight", variant: "double" OR variant: "heavyx" OR variant: "xcash")
- Integrated Zelerius variant (algo: "cryptonight", variant: "zls" OR variant: "zelerius")
- Added miner version column to the Dashboard (the version turns red when it's outdated)
- Fixed crash when remote logging is disabled
---
 CHANGELOG.md                                    |    8 +
 CMakeLists.txt                                  |    3 +-
 cmake/asm.cmake                                 |   43 +-
 index.html                                      |   16 +
 src/App.cpp                                     |    8 +-
 src/Cpu_arm.cpp                                 |    6 +-
 src/Mem.cpp                                     |   10 +-
 src/Mem.h                                       |    3 +
 src/Mem_unix.cpp                                |   16 +
 src/Mem_win.cpp                                 |   10 +
 src/Options.cpp                                 |   47 +-
 src/Options.h                                   |    4 +-
 src/PowVariant.h                                |   25 +
 src/config.json                                 |    3 +-
 src/crypto/CryptoNight.cpp                      |  609 +++--
 src/crypto/CryptoNight.h                        |   28 +-
 src/crypto/CryptoNightR_gen.cpp                 |  190 ++
 src/crypto/CryptoNight_arm.h                    | 2030 +++++++-------
 src/crypto/CryptoNight_test.h                   |   47 +
 src/crypto/CryptoNight_x86.h                    | 2324 +++++++++--------
 .../asm/CryptonightR_soft_aes_template.inc      |  279 ++
 src/crypto/asm/CryptonightR_template.S          | 1595 +++++++++++
 src/crypto/asm/CryptonightR_template.h          | 1087 ++++++++
 src/crypto/asm/CryptonightR_template.inc        |  529 ++++
 .../asm/CryptonightWOW_soft_aes_template.inc    |  266 ++
 src/crypto/asm/CryptonightWOW_template.inc      |  486 ++++
 src/crypto/asm/cn_main_loop.S                   |  163 ++
 .../asm/cnv2_double_main_loop_rwz_all.inc       |  410 +++
 src/crypto/asm/cnv2_main_loop_rwz_all.inc       |  186 ++
 .../win/CryptonightR_soft_aes_template.inc      |  279 ++
 .../CryptonightR_soft_aes_template_win.inc      |  279 ++
 src/crypto/asm/win/CryptonightR_template.S      | 1595 +++++++++++
 src/crypto/asm/win/CryptonightR_template.asm    | 1585 +++++++++++
 src/crypto/asm/win/CryptonightR_template.inc    |  529 ++++
 .../asm/win/CryptonightR_template_win.inc       |  529 ++++
 .../win/CryptonightWOW_soft_aes_template.inc    |  266 ++
 .../CryptonightWOW_soft_aes_template_win.inc    |  266 ++
 .../asm/win/CryptonightWOW_template.inc         |  486 ++++
 .../asm/win/CryptonightWOW_template_win.inc     |  486 ++++
 src/crypto/asm/win/cn_main_loop.asm             |   87 +
 src/crypto/asm/win/cn_main_loop_win_gcc.S       |   75 +
 .../asm/win/cnv2_double_main_loop_rwz_all.inc   |  410 +++
 src/crypto/asm/win/cnv2_main_loop_rwz_all.inc   |  186 ++
 src/crypto/variant4_random_math.h               |  447 ++++
 src/net/Client.cpp                              |    8 +
 src/net/Job.cpp                                 |    1 +
 src/net/Job.h                                   |    3 +
 src/net/Network.cpp                             |    4 +-
 src/version.h                                   |    6 +-
 src/workers/MultiWorker.cpp                     |    2 +-
 50 files changed, 15657 insertions(+), 2303 deletions(-)
 create mode 100644 src/crypto/CryptoNightR_gen.cpp
 create mode 100644 src/crypto/asm/CryptonightR_soft_aes_template.inc
 create mode 100644 src/crypto/asm/CryptonightR_template.S
 create mode 100644 src/crypto/asm/CryptonightR_template.h
 create mode 100644 src/crypto/asm/CryptonightR_template.inc
 create mode 100644 src/crypto/asm/CryptonightWOW_soft_aes_template.inc
 create mode 100644 src/crypto/asm/CryptonightWOW_template.inc
 create mode 100644 src/crypto/asm/cnv2_double_main_loop_rwz_all.inc
 create mode 100644 src/crypto/asm/cnv2_main_loop_rwz_all.inc
 create mode 100644 src/crypto/asm/win/CryptonightR_soft_aes_template.inc
 create mode 100644 src/crypto/asm/win/CryptonightR_soft_aes_template_win.inc
 create mode 100644 src/crypto/asm/win/CryptonightR_template.S
 create mode 100644 src/crypto/asm/win/CryptonightR_template.asm
 create mode 100644 src/crypto/asm/win/CryptonightR_template.inc
 create mode 100644 src/crypto/asm/win/CryptonightR_template_win.inc
 create mode 100644 src/crypto/asm/win/CryptonightWOW_soft_aes_template.inc
 create mode 100644 src/crypto/asm/win/CryptonightWOW_soft_aes_template_win.inc
 create mode 100644 src/crypto/asm/win/CryptonightWOW_template.inc
 create mode 100644 src/crypto/asm/win/CryptonightWOW_template_win.inc
 create mode 100644 src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc
 create mode 100644 src/crypto/asm/win/cnv2_main_loop_rwz_all.inc
 create mode 100644 src/crypto/variant4_random_math.h

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 18acfca9..7f0966ab 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# 1.9.0
+- Integrated Monero CN-R variant, so-called CNv4 (aka CNv5, aka Cryptonight-R) #233 (algo: "cryptonight", variant: "r")
+- Integrated Wownero CN-R variant #233 (algo: "cryptonight", variant: "wow")
+- Integrated Graft variant (algo: "cryptonight", variant: "rwz" OR variant: "graft")
+- Integrated X-Cash variant #234 (algo: "cryptonight", variant: "double" OR variant: "heavyx" OR variant: "xcash")
+- Integrated Zelerius variant (algo: "cryptonight", variant: "zls" OR variant: "zelerius")
+- Added miner version column to the Dashboard (the version turns red when it's outdated)
+- Fixed crash when remote logging is disabled
 # 1.8.13
 - Integrated HOSP variant (algo: "cryptonight", variant: "hosp")
 - Added ASM code/optimization for HOSP and RTO on Intel CPUs

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e96f979b..757b26b8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -54,6 +54,7 @@ set(SOURCES_CRYPTO
     src/crypto/c_jh.c
     src/crypto/c_skein.c
     src/crypto/CryptoNight.cpp
+    src/crypto/CryptoNightR_gen.cpp
 )
 
 set(SOURCES_COMMON
@@ -131,7 +132,7 @@ if (WIN32)
     add_definitions(-DBOOST_ALL_NO_LIB)
 endif(WIN32)
 
-find_package(Boost 1.63.0 COMPONENTS system REQUIRED)
+find_package(Boost 1.62.0 COMPONENTS system REQUIRED)
 
 include(cmake/flags.cmake)

diff --git a/cmake/asm.cmake b/cmake/asm.cmake
index abd4030c..b5067939 100644
--- a/cmake/asm.cmake
+++ b/cmake/asm.cmake
@@ -56,6 +56,40 @@ configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/
 configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_fastv2_sandybridge.inc")
 configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc")
 
+# CN XCASH
+set(ALGO "xcash")
+set(ITERATIONS "1048576") #0x100000
+set(MASK "2097136") #0x1FFFF0
+
+configure_file("src/crypto/asm/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/cnv2_main_loop_xcash_ivybridge.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/cnv2_main_loop_xcash_bulldozer.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/cnv2_main_loop_xcash_ryzen.inc")
+configure_file("src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv2_double_main_loop_xcash_sandybridge.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv2_main_loop_xcash_soft_aes_sandybridge.inc")
+
+configure_file("src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_xcash_ivybridge.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/win/cnv2_main_loop_xcash_bulldozer.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/win/cnv2_main_loop_xcash_ryzen.inc")
+configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_xcash_sandybridge.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_xcash_soft_aes_sandybridge.inc")
+
+# CN ZELERIUS
+set(ALGO "zelerius")
+set(ITERATIONS "393216") #0x60000
+set(MASK "2097136") #0x1FFFF0
+
+configure_file("src/crypto/asm/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/cnv2_main_loop_zelerius_ivybridge.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/cnv2_main_loop_zelerius_bulldozer.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/cnv2_main_loop_zelerius_ryzen.inc")
+configure_file("src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv2_double_main_loop_zelerius_sandybridge.inc")
+configure_file("src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv2_main_loop_zelerius_soft_aes_sandybridge.inc")
+
+configure_file("src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_zelerius_ivybridge.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/win/cnv2_main_loop_zelerius_bulldozer.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/win/cnv2_main_loop_zelerius_ryzen.inc")
+configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_zelerius_sandybridge.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_zelerius_soft_aes_sandybridge.inc")
+
 # CN LITE
 
 set(ALGO "lite")
@@ -99,16 +133,19 @@ configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "
 
 if (CMAKE_C_COMPILER_ID MATCHES MSVC)
     enable_language(ASM_MASM)
-    set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm")
+    set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm"
+            "src/crypto/asm/win/CryptonightR_template.asm")
     set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
 
     include_directories(${CMAKE_BINARY_DIR}/src/crypto/asm/win)
 else()
     enable_language(ASM)
 
     if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
-        set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S")
+        set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S"
+                "src/crypto/asm/win/CryptonightR_template.S")
     else()
-        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
+        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S"
+                "src/crypto/asm/CryptonightR_template.S")
     endif()
 
     set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)

diff --git a/index.html b/index.html
index 9a8d953c..ec011264 100644
--- a/index.html
+++ b/index.html
@@ -65,6 +65,9 @@
         var currentServerTime = 0;
         var clockDrift = 0;
 
+        var latestVersion = 0;
+        var currentVersion = 0;
+
         $.fn.dataTable.ext.search.push(
             function( settings, data, dataIndex ) {
@@ -108,6 +111,7 @@
                     orderable: false
                 },
                 {data: "client_status.client_id", render: clientInfo},
+                {data: "client_status.version", render: version},
                 {data: "client_status.current_pool"},
                 {data: "client_status.current_status", render: clientStatus},
                 {data: "client_status.current_algo_name", render: algoAndPowVariantName},
@@ -674,6 +678,16 @@
             }
         }
 
+        function version( data, type, row ) {
+            var clientVersion = parseInt(row.client_status.version.split('.').join(""));
+
+            if (latestVersion > clientVersion) {
+                return '<span style="color: red">' + data + '</span>';
+            } else {
+                return data;
+            }
+        }
+
         function clientStatus( data, type, row ) {
             var lastStatus = row.client_status.last_status_update * 1000;
@@ -822,6 +836,7 @@
                 <th>Miner Id</th>
+                <th>Version</th>
                 <th>Pool</th>
                 <th>Status</th>
                 <th>Algo / PoW</th>
@@ -861,6 +876,7 @@
                 <th></th>
+                <th></th>

diff --git a/src/App.cpp b/src/App.cpp
index b240431e..45c30757 100644
--- a/src/App.cpp
+++ b/src/App.cpp
@@ -155,10 +155,14 @@ int App::start()
         return EINVAL;
     }
     else {
         if (Options::i()->colors()) {
-            LOG_INFO(WHITE_BOLD("%s hash self-test... ") GREEN_BOLD("successful") ".", m_options->algoName());
+            LOG_INFO(WHITE_BOLD("%s hash self-test... %s."),
+                     m_options->algoName(),
+                     Options::i()->skipSelfCheck() ? YELLOW_BOLD("skipped") : GREEN_BOLD("successful"));
         }
         else {
-            LOG_INFO("%s hash self-test... successful.", m_options->algoName());
+            LOG_INFO("%s hash self-test... %s.",
+                     m_options->algoName(),
+                     Options::i()->skipSelfCheck() ? "skipped" : "successful");
         }
     }

diff --git a/src/Cpu_arm.cpp b/src/Cpu_arm.cpp
index db6ffa30..7be95170 100644
--- a/src/Cpu_arm.cpp
+++ b/src/Cpu_arm.cpp
@@ -30,7 +30,11 @@
 
 void CpuImpl::initCommon()
 {
-    memcpy(m_brand, "Unknown", 7);
+#   ifdef XMRIG_ARMv8
+    memcpy(m_brand, "ARMv8", 5);
+#   else
+    memcpy(m_brand, "ARMv7", 5);
+#   endif
 
 #   if defined(XMRIG_ARMv8)
     m_flags |= Cpu::X86_64;

diff --git a/src/Mem.cpp b/src/Mem.cpp
index a9a233b4..cd82339c 100644
--- a/src/Mem.cpp
+++ b/src/Mem.cpp
@@ -67,9 +67,17 @@ ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId)
     allocate(scratchPadMem, m_useHugePages);
 
     for (size_t i = 0; i < getThreadHashFactor(threadId); ++i) {
-        ScratchPad* scratchPad = static_cast<ScratchPad*>(_mm_malloc(sizeof(ScratchPad), 4096));
+        auto* scratchPad = static_cast<ScratchPad*>(_mm_malloc(sizeof(ScratchPad), 4096));
         scratchPad->memory = scratchPadMem.memory + (i * scratchPadSize);
 
+        auto* p = reinterpret_cast<uint8_t*>(allocateExecutableMemory(0x4000));
+        scratchPad->generated_code = reinterpret_cast<cn_mainloop_fun_ms_abi>(p);
+        scratchPad->generated_code_double = reinterpret_cast<cn_mainloop_double_fun_ms_abi>(p + 0x2000);
+
+        scratchPad->generated_code_data.variant = PowVariant::LAST_ITEM;
+        scratchPad->generated_code_data.height = (uint64_t)(-1);
+        scratchPad->generated_code_double_data = scratchPad->generated_code_data;
+
         scratchPads[i] = scratchPad;
     }

diff --git a/src/Mem.h b/src/Mem.h
index 790bdd7e..94f74b22 100644
--- a/src/Mem.h
+++ b/src/Mem.h
@@ -75,6 +75,9 @@ public:
     static ScratchPadMem create(ScratchPad** scratchPads, int threadId);
     static void release(ScratchPad** scratchPads, ScratchPadMem& scratchPadMem, int threadId);
 
+    static void *allocateExecutableMemory(size_t size);
+    static void flushInstructionCache(void *p, size_t size);
+
     static inline size_t hashFactor() { return m_hashFactor; }
     static inline size_t getThreadHashFactor(int threadId)
     {

diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp
index 8acac2fa..53309406 100644
--- a/src/Mem_unix.cpp
+++ b/src/Mem_unix.cpp
@@ -86,3 +86,19 @@ void Mem::release(ScratchPadMem &scratchPadMem)
         _mm_free(scratchPadMem.memory);
     }
 }
+
+void *Mem::allocateExecutableMemory(size_t size)
+{
+#   if defined(__APPLE__)
+    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANON, -1, 0);
+#   else
+    return mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+#   endif
+}
+
+void Mem::flushInstructionCache(void *p, size_t size)
+{
+#   ifndef __FreeBSD__
+    __builtin___clear_cache(reinterpret_cast<char*>(p), reinterpret_cast<char*>(p) + size);
+#   endif
+}

diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp
index 1a8e582d..94ad8e06 100644
--- a/src/Mem_win.cpp
+++ b/src/Mem_win.cpp
@@ -182,4 +182,14 @@ void Mem::release(ScratchPadMem &scratchPadMem)
     else {
         _mm_free(scratchPadMem.memory);
     }
+}
+
+void *Mem::allocateExecutableMemory(size_t size)
+{
+    return VirtualAlloc(0, size, MEM_COMMIT | MEM_RESERVE, PAGE_EXECUTE_READWRITE);
+}
+
+void Mem::flushInstructionCache(void *p, size_t size)
+{
+    ::FlushInstructionCache(GetCurrentProcess(), p, size);
 }
\ No newline at end of file
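The two helpers added above are the platform seam the CN-R code generator builds on: mmap(PROT_EXEC) and VirtualAlloc(PAGE_EXECUTE_READWRITE) hand back a buffer the miner can both write into and jump into, while flushInstructionCache keeps the CPU from executing stale bytes after the buffer is rewritten. The following is a minimal sketch of that life cycle, assuming x86-64, that src/ is on the include path, and using a toy one-byte program in place of the real emitter from CryptoNightR_gen.cpp; error handling is elided.

    #include <cstddef>
    #include <cstdint>

    #include "Mem.h"

    // Toy emitter: writes a single x86-64 `ret` (0xC3) so the buffer is safely
    // callable. The real generator in CryptoNightR_gen.cpp emits the per-height
    // random-math main loop here instead.
    static size_t emitProgram(uint8_t* code, uint64_t /*height*/)
    {
        code[0] = 0xC3;
        return 1;
    }

    int main()
    {
        // Same size the patch reserves per scratchpad: one 16 KiB RWX block.
        auto* code = static_cast<uint8_t*>(Mem::allocateExecutableMemory(0x4000));

        const size_t written = emitProgram(code, 1806260 /* arbitrary example height */);

        // The CPU may still cache stale instructions for this range; flush
        // before jumping into freshly written code (a no-op on some platforms).
        Mem::flushInstructionCache(code, written);

        reinterpret_cast<void (*)()>(code)();
        return 0;
    }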
diff --git a/src/Options.cpp b/src/Options.cpp
index 87405c4a..3619b293 100644
--- a/src/Options.cpp
+++ b/src/Options.cpp
@@ -73,7 +73,7 @@ Options:\n"
   -k, --keepalive           send keepalived for prevent timeout (need pool support)\n\
   -r, --retries=N           number of times to retry before switch to backup server (default: 5)\n\
   -R, --retry-pause=N       time to pause between retries (default: 5)\n\
-      --pow-variant=V       specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx', 'turtle', 'hosp'\n\
+      --pow-variant=V       specify the PoW variant to use: \n'auto' (default), '0', '1', '2', 'ipbc', 'xao', 'xtl', 'rto', 'xfh', 'upx', 'turtle', 'hosp', 'r', 'wow', 'double' (xcash), 'zls' (zelerius), 'rwz' (graft)\n\
                             for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
       --asm-optimization=V  specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
       --multihash-factor=N  number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
@@ -92,7 +92,8 @@
       --api-access-token=T  access token for API\n\
       --api-worker-id=ID    custom worker-id for API\n\
       --reboot-cmd          command/bat to execute to Reboot miner\n\
-      --force-pow-variant   disable pow/variant parsing from pool\n"
+      --force-pow-variant   skip pow/variant parsing from pool\n\
+      --skip-self-check     skip self check on startup\n"
@@ -179,6 +180,7 @@ static struct option const options[] = {
     { "force-pow-variant",  0, nullptr, 1016 },
     { "pow-variant",        1, nullptr, 1017 },
     { "variant",            1, nullptr, 1017 },
+    { "skip-self-check",    0, nullptr, 1018 },
     { "api-port",           1, nullptr, 4000 },
     { "api-access-token",   1, nullptr, 4001 },
     { "api-worker-id",      1, nullptr, 4002 },
@@ -237,6 +239,7 @@ static struct option const config_options[] = {
     { "force-pow-variant",       0, nullptr, 1016 },
     { "pow-variant",             1, nullptr, 1017 },
     { "variant",                 1, nullptr, 1017 },
+    { "skip-self-check",         0, nullptr, 1018 },
     { "doublehash-thread-mask",  1, nullptr, 4013 },
     { "multihash-thread-mask",   1, nullptr, 4013 },
     { "asm-optimization",        1, nullptr, 4020 },
@@ -331,7 +334,10 @@ constexpr static const char *pow_variant_names[] = {
     "fast2",
     "upx",
     "turtle",
-    "hosp"
+    "hosp",
+    "wow",
+    "r",
+    "xcash"
 };
 
 constexpr static const char *asm_optimization_names[] = {
@@ -380,6 +386,7 @@ Options::Options(int argc, char **argv) :
     m_ccPushPeriodicStatus(false),
     m_ccPushZeroHashrateMiners(false),
     m_forcePowVariant(false),
+    m_skipSelfCheck(false),
     m_fileName(Platform::defaultConfigName()),
     m_apiToken(nullptr),
     m_apiWorkerId(nullptr),
@@ -643,11 +650,14 @@ bool Options::parseArg(int key, const char *arg)
         return parseBoolean(key, true);
 
     case 1016: /* --force-pow-variant */
-        return parseBoolean(key, false);
+        return parseBoolean(key, true);
 
     case 1017: /* --pow-variant/--variant */
         return parsePowVariant(arg);
 
+    case 1018: /* --skip-self-check */
+        return parseBoolean(key, true);
+
     case 4016: /* --cc-use-tls */
         return parseBoolean(key, true);
 
@@ -912,6 +922,10 @@ bool Options::parseBoolean(int key, bool enable)
         m_forcePowVariant = enable;
         break;
 
+    case 1018: /* --skip-self-check */
+        m_skipSelfCheck = enable;
+        break;
+
     case 2000: /* --colors */
         m_colors = enable;
         break;
@@ -1206,6 +1220,31 @@ bool Options::parsePowVariant(const char *powVariant)
             break;
         }
 
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "wow")) {
+            m_powVariant = POW_WOW;
+            break;
+        }
+
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "4") || !strcmp(powVariant, "r") || !strcmp(powVariant, "cnv4") || !strcmp(powVariant, "cnv5"))) {
+            m_powVariant = POW_V4;
+            break;
+        }
+
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "xcash") || !strcmp(powVariant, "heavyx") || !strcmp(powVariant, "double"))) {
+            m_powVariant = POW_DOUBLE;
+            break;
+        }
+
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "zelerius") || !strcmp(powVariant, "zls") || !strcmp(powVariant, "zlx"))) {
+            m_powVariant = POW_ZELERIUS;
+            break;
+        }
+
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "rwz") || !strcmp(powVariant, "graft"))) {
+            m_powVariant = POW_RWZ;
+            break;
+        }
+
         if (i == ARRAY_SIZE(pow_variant_names) - 1) {
             showUsage(1);
             return false;

diff --git a/src/Options.h b/src/Options.h
index a02044f6..902eed3d 100644
--- a/src/Options.h
+++ b/src/Options.h
@@ -84,7 +84,8 @@ public:
     inline bool ccPushZeroHashrateMiners() const    { return m_ccPushZeroHashrateMiners; }
     inline bool ccUsePushover() const               { return ccPushoverUser() && ccPushoverToken(); }
     inline bool ccUseTelegram() const               { return ccTelegramBotToken() && ccTelegramChatId(); }
-    inline bool forcePowVariant() const             { return m_forcePowVariant; };
+    inline bool forcePowVariant() const             { return m_forcePowVariant; }
+    inline bool skipSelfCheck() const               { return m_skipSelfCheck; }
     inline const char *fileName() const             { return m_fileName; }
     inline const char *apiToken() const             { return m_apiToken; }
     inline const char *apiWorkerId() const          { return m_apiWorkerId; }
@@ -171,6 +172,7 @@ private:
     bool m_ccPushPeriodicStatus;
     bool m_ccPushZeroHashrateMiners;
     bool m_forcePowVariant;
+    bool m_skipSelfCheck;
     const char* m_fileName;
     char *m_apiToken;
     char *m_apiWorkerId;

diff --git a/src/PowVariant.h b/src/PowVariant.h
index a03fbd22..17ddec11 100644
--- a/src/PowVariant.h
+++ b/src/PowVariant.h
@@ -39,6 +39,11 @@ enum PowVariant
     POW_UPX,
     POW_TURTLE,
     POW_HOSP,
+    POW_WOW,
+    POW_V4,
+    POW_DOUBLE,
+    POW_ZELERIUS,
+    POW_RWZ,
     LAST_ITEM
 };
 
@@ -74,6 +79,16 @@ inline std::string getPowVariantName(PowVariant powVariant)
             return "turtle";
         case POW_HOSP:
             return "hosp";
+        case POW_WOW:
+            return "wow";
+        case POW_V4:
+            return "r";
+        case POW_DOUBLE:
+            return "double";
+        case POW_ZELERIUS:
+            return "zls";
+        case POW_RWZ:
+            return "rwz";
         case POW_AUTODETECT:
         default:
             return "-1";
@@ -149,6 +164,16 @@ inline PowVariant parseVariant(const std::string variant)
         powVariant = PowVariant::POW_TURTLE;
     } else if (variant == "hosp" || variant == "hospital") {
         powVariant = PowVariant::POW_HOSP;
+    } else if (variant == "wow" || variant == "wownero") {
+        powVariant = PowVariant::POW_WOW;
+    } else if (variant == "r" || variant == "4" || variant == "cnv4" || variant == "cnv5") {
+        powVariant = PowVariant::POW_V4;
+    } else if (variant == "xcash" || variant == "heavyx" || variant == "double") {
+        powVariant = PowVariant::POW_DOUBLE;
+    } else if (variant == "zelerius" || variant == "zls" || variant == "zlx") {
+        powVariant = PowVariant::POW_ZELERIUS;
+    } else if (variant == "rwz" || variant == "graft") {
+        powVariant = PowVariant::POW_RWZ;
     }
 
     return powVariant;
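Each new variant accepts several spellings (generic name, coin name, ticker), and they all collapse onto a single enum value, which is what the changelog's `variant: "rwz" OR variant: "graft"` notation means. A self-contained sanity check of the mapping above, assuming only that src/PowVariant.h is on the include path:

    #include <cassert>

    #include "PowVariant.h"

    int main()
    {
        // Aliases added in this release, one enum value per variant.
        assert(parseVariant("r")        == PowVariant::POW_V4);       // also "4", "cnv4", "cnv5"
        assert(parseVariant("wownero")  == PowVariant::POW_WOW);      // also "wow"
        assert(parseVariant("graft")    == PowVariant::POW_RWZ);      // also "rwz"
        assert(parseVariant("xcash")    == PowVariant::POW_DOUBLE);   // also "heavyx", "double"
        assert(parseVariant("zelerius") == PowVariant::POW_ZELERIUS); // also "zls", "zlx"

        // The canonical name reported back for CN-R is "r".
        assert(getPowVariantName(PowVariant::POW_V4) == "r");
        return 0;
    }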
diff --git a/src/config.json b/src/config.json
index 50f1f9ca..c7a89a1f 100644
--- a/src/config.json
+++ b/src/config.json
@@ -4,7 +4,7 @@
     "threads": 0,                   // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
     "multihash-factor": 0,          // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
     "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
-    "pow-variant" : "auto",         // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx, turtle, hosp
+    "pow-variant" : "auto",         // specify the PoW variant to use: -> auto (default), 0, 1, 2, ipbc, xao, xtl, rto, xfh, upx, turtle, hosp, r, wow, double (xcash), zls (zelerius), rwz (graft)
                                     // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
     "asm-optimization" : "auto",    // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
     "background": false,            // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead)
@@ -21,6 +21,7 @@
     "syslog": false,                // use system log for output messages
     "reboot-cmd" : "",              // command to execute to reboot the OS
     "force-pow-variant" : false,    // force pow variant, dont parse pow/variant from pool job
+    "skip-self-check" : false,      // skip the self check on startup
     "pools": [
         {
             "url": "donate2.graef.in:80",    // URL of mining server
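The CryptoNight.cpp rework below threads the block height through every dispatch function because CN-R derives its random math program from the pair (variant, height): the program must be recompiled whenever either changes. The generated_code_data fields initialised in Mem.cpp above act as the cache key, seeded with LAST_ITEM and (uint64_t)(-1) so the very first hash always triggers compilation. Here is a sketch of the guard those fields imply; compileProgram() is a hypothetical stand-in for the real emitter in CryptoNightR_gen.cpp, and the ScratchPad field types are assumed (hence the template parameter):

    #include <cstdint>

    #include "Mem.h"
    #include "PowVariant.h"

    // Hypothetical stand-in for the generator in CryptoNightR_gen.cpp, which
    // writes the height-specific random-math loop into the RWX buffer.
    static void compileProgram(PowVariant /*variant*/, uint64_t /*height*/, void* /*code*/) {}

    template <typename SCRATCHPAD>
    static void ensureGeneratedCode(SCRATCHPAD* scratchPad, PowVariant variant, uint64_t height)
    {
        // Recompile only when the (variant, height) cache key changes.
        if (scratchPad->generated_code_data.variant != variant ||
            scratchPad->generated_code_data.height != height) {

            void* code = reinterpret_cast<void*>(scratchPad->generated_code);
            compileProgram(variant, height, code);

            // generated_code occupies the first half of the 0x4000 block
            // allocated in Mem::create(); generated_code_double the second.
            Mem::flushInstructionCache(code, 0x2000);

            scratchPad->generated_code_data.variant = variant;
            scratchPad->generated_code_data.height  = height;
        }
    }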
diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp
index 0040a164..cd4b6699 100644
--- a/src/crypto/CryptoNight.cpp
+++ b/src/crypto/CryptoNight.cpp
@@ -23,6 +23,7 @@
  * along with this program. If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include
 #include "crypto/CryptoNight.h"
 
 #if defined(XMRIG_ARM)
@@ -34,282 +35,398 @@
 #include "crypto/CryptoNight_test.h"
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+static void cryptonight_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 #   if !defined(XMRIG_ARMv7)
-    if (powVersion == PowVariant::POW_V1) {
+    if (variant == PowVariant::POW_V1) {
 #if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
 #else
         if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
         } else {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
         }
 #endif
-    } else if (powVersion == PowVariant::POW_V2) {
+    } else if (variant == PowVariant::POW_V2) {
 #if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
         if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
             (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
             (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V2, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
         } else {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
         }
 #endif
-} else if (powVersion == PowVariant::POW_ALLOY) {
-    CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
-} else if (powVersion == PowVariant::POW_XTL) {
+    } else if (variant == PowVariant::POW_V4) {
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V4, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+#else
+        if (asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS <= 2) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V4, NUM_HASH_BLOCKS>::hashPowV4_asm(input, size, output, scratchPad, height, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V4, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+        }
+#endif
+    } else if (variant == PowVariant::POW_WOW) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+#else
+        if (asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS <= 2) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4_asm(input, size, output, scratchPad, height, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+        }
+#endif
+    } else if (variant == PowVariant::POW_ALLOY) {
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_ALLOY, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_XTL) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
 #else
     if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     }
 #endif
-} else if (powVersion == PowVariant::POW_FAST_2) {
+    } else if (variant == PowVariant::POW_FAST_2) {
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
     if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
         (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
         (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
     }
 #endif
-} else if (powVersion == PowVariant::POW_MSR) {
+    } else if (variant == PowVariant::POW_DOUBLE) {
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
+            (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
+            (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
+            CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_ZELERIUS) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
+            (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
+            (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_RWZ) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RWZ, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if ((asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS <= 2)) {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RWZ, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RWZ, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_MSR) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
 #else
     if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     }
 #endif
-} else if (powVersion == PowVariant::POW_RTO || powVersion == PowVariant::POW_HOSP) {
+    } else if (variant == PowVariant::POW_RTO) {
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RTO, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
 #else
     if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RTO, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_RTO, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     }
 #endif
-} else if (powVersion == PowVariant::POW_XFH) {
-    CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
-} else {
-    CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
-}
-# endif
-}
-
-template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
-    if (powVersion == PowVariant::POW_V1) {
+    } else if (variant == PowVariant::POW_HOSP) {
 #if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_HOSP, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
 #else
         if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_HOSP, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
         } else {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_HOSP, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
         }
 #endif
-    } else if (powVersion == PowVariant::POW_V2) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_FAST_2) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_ALLOY) {
-        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
-    } else if (powVersion == PowVariant::POW_XTL) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_MSR) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_RTO || powVersion == PowVariant::POW_HOSP) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_XFH) {
-        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_XFH) {
+        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_XFH, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     } else {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
-    }
-}
-
-template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
-# if !defined(XMRIG_ARMv7)
-    if (powVersion == PowVariant::POW_V1) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-        }
-#endif
-    } else if (powVersion == PowVariant::POW_TUBE) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
-    } else if (powVersion == PowVariant::POW_UPX) {
-#if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-#else
-        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
-        } else {
-            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
-        }
-#endif
-    } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 #   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
-    if (powVersion == PowVariant::POW_V1) {
+static void cryptonight_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+    if (variant == PowVariant::POW_V1) {
 #if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
 #else
         if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
         } else {
-            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
         }
 #endif
-    } else if (powVersion == PowVariant::POW_TUBE) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
-    } else if (powVersion == PowVariant::POW_UPX) {
+    } else if (variant == PowVariant::POW_V2) {
 #if defined(XMRIG_ARM)
-        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
         if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V2, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
         } else {
-            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
         }
 #endif
+    } else if (variant == PowVariant::POW_V4) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V4, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+#else
+        if (asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V4, NUM_HASH_BLOCKS>::hashPowV4_asm(input, size, output, scratchPad, height, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V4, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+        }
+#endif
+    } else if (variant == PowVariant::POW_WOW) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+#else
+        if (asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4_asm(input, size, output, scratchPad, height, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_WOW, NUM_HASH_BLOCKS>::hashPowV4(input, size, output, scratchPad, height);
+        }
+#endif
+    } else if (variant == PowVariant::POW_FAST_2) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_FAST_2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_DOUBLE) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_DOUBLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_ZELERIUS) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_ZELERIUS, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_RWZ) {
+        CryptoNightMultiHash<0x60000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_RWZ, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_ALLOY) {
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_ALLOY, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_XTL) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_XTL, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_MSR) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_MSR, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_RTO) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_RTO, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_RTO, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_RTO, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_HOSP) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_HOSP, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_HOSP, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_HOSP, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_XFH) {
+        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_XFH, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_super_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+static void cryptonight_lite_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+#   if !defined(XMRIG_ARMv7)
+    if (variant == PowVariant::POW_V1) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_TUBE, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_UPX) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+    }
+#   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+static void cryptonight_lite_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+    if (variant == PowVariant::POW_V1) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_V1, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else if (variant == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_TUBE, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
+    } else if (variant == PowVariant::POW_UPX) {
+#if defined(XMRIG_ARM)
+        CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+#else
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_UPX, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
+#endif
+    } else {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, POW_V0, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+    }
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
-# if !defined(XMRIG_ARMv7)
+-static void cryptonight_super_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
++static void cryptonight_super_lite_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+ }
+ 
+ template <size_t NUM_HASH_BLOCKS>
+-static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
++static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+ }
+ 
+ template <size_t NUM_HASH_BLOCKS>
+-static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
++static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+ #   if !defined(XMRIG_ARMv7)
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+    CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
     if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
         (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
         (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
-        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
     }
 #endif
-# endif
+#   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
++static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 #if defined(XMRIG_ARM)
-    CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+    CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
     if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
-        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization, powVersion);
+        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
     } else {
-        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, POW_TURTLE, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
     }
 #endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
-# if !defined(XMRIG_ARMv7)
-    if (powVersion == PowVariant::POW_XHV) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
+static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
++static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+ #   if !defined(XMRIG_ARMv7)
+-    if (powVersion == PowVariant::POW_XHV) {
+-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
++    if (variant == PowVariant::POW_XHV) {
++        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, POW_XHV, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     }
-    else if (powVersion == PowVariant::POW_TUBE) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad);
+    else if (variant == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, POW_TUBE, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad);
     }
     else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, POW_V0, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     }
 #   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_heavy_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size,
uint8_t* output, ScratchPad** scratchPad) { - if (powVersion == PowVariant::POW_XHV) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); +static void cryptonight_heavy_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { + if (variant == PowVariant::POW_XHV) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, POW_XHV, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad); } - else if (powVersion == PowVariant::POW_TUBE) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad); + else if (variant == PowVariant::POW_TUBE) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, POW_TUBE, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, POW_V0, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad); } } -void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); +void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); template void setCryptoNightHashMethods(Options::Algo algo, bool aesni) @@ -377,15 +494,16 @@ bool CryptoNight::init(int algo, bool aesni) } setCryptoNightHashMethods(static_cast(algo), aesni); - return selfTest(algo); + + return Options::i()->skipSelfCheck() ? 
true : selfCheck(algo); } -void CryptoNight::hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) +void CryptoNight::hash(size_t factor, AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { - cryptonight_hash_ctx[factor-1](asmOptimization, powVersion, input, size, output, scratchPad); + cryptonight_hash_ctx[factor-1](asmOptimization, height, variant, input, size, output, scratchPad); } -bool CryptoNight::selfTest(int algo) +bool CryptoNight::selfCheck(int algo) { if (cryptonight_hash_ctx[0] == nullptr #if MAX_NUM_HASH_BLOCKS > 1 @@ -413,6 +531,14 @@ bool CryptoNight::selfTest(int algo) ScratchPad* scratchPad = static_cast(_mm_malloc(sizeof(ScratchPad), 4096)); scratchPad->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16); + auto* p = reinterpret_cast(Mem::allocateExecutableMemory(0x4000)); + scratchPad->generated_code = reinterpret_cast(p); + scratchPad->generated_code_double = reinterpret_cast(p + 0x2000); + + scratchPad->generated_code_data.variant = PowVariant::LAST_ITEM; + scratchPad->generated_code_data.height = (uint64_t)(-1); + scratchPad->generated_code_double_data = scratchPad->generated_code_data; + scratchPads[i] = scratchPad; } @@ -427,129 +553,128 @@ bool CryptoNight::selfTest(int algo) if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) { // cn-heavy - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0; #endif // cn-heavy haven - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 96) == 0; #endif // cn-heavy bittube - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); 
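For reference, every self-check call in this function follows the same recipe: hash the fixed 76-byte test blob with a given block factor, then compare the first factor * 32 bytes of output against the matching test_output_* reference vector. A minimal sketch of the pattern, assuming the surrounding selfCheck() locals (factor and expected are placeholders):

    // hash `factor` blocks at once; block i is read from test_input + i * 76
    cryptonight_hash_ctx[factor - 1](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads);
    // each block contributes 32 bytes of digest
    bool ok = memcmp(output, expected, factor * 32) == 0;

The height argument (0 here) only influences the CN-R style variants tested further down, whose random-math program is regenerated per block height.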
resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 96) == 0; #endif } else if (algo == Options::ALGO_CRYPTONIGHT_LITE) { // cn-lite v0 - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 160) == 0; #endif // cn-lite v7 tests - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); - resultLite = resultLite && memcmp(output, test_output_v1_lite, 32) == 0; + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); + resultLite = resultLite && memcmp(output, test_output_v1_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); 
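Worth noting for the POW_V1 (v7) vectors being checked here: v1 differs from v0 mainly by a 64-bit tweak XOR-ed into the upper word written back to the scratchpad. As the hashPowV2/hashLiteTube bodies further down in this patch show, it is derived as

    uint64_t tweak1_2 = *reinterpret_cast<const uint64_t*>(input + 35)
                        ^ *(reinterpret_cast<const uint64_t*>(scratchPad->state) + 24);

i.e. eight input bytes starting at offset 35 mixed with the Keccak state, which is why v1-style variants require inputs of at least 43 bytes; the 76-byte test input used here satisfies that.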
resultLite = resultLite && memcmp(output, test_output_v1_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 160) == 0; #endif - // cn-lite ibpc tests - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0; #endif // cn-lite upx - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_UPX, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0; } else if (algo == Options::ALGO_CRYPTONIGHT_SUPERLITE) { @@ -559,123 +684,173 @@ bool CryptoNight::selfTest(int algo) } else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) { // cn ultralite (cnv8 + turtle) - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TURTLE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_TURTLE, test_input, 76, output, scratchPads); resultUltraLite = resultUltraLite && memcmp(output, test_output_turtle, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TURTLE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_TURTLE, test_input, 76, output, scratchPads); resultUltraLite = resultUltraLite && memcmp(output, test_output_turtle, 64) == 0; #endif } else { // cn v0 aka orignal - - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V0,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + 
cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 160) == 0; #endif // cn v7 aka cnv1 - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 160) == 0; #endif // cnv7 + xtl - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_XTL,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 32) == 0; // cnv7 + msr aka cn-fast - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_MSR,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_MSR,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_msr, 32) == 0; // cnv7 + alloy - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_alloy, 32) == 0; // cnv7 + hosp/rto - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_HOSP,test_input, 76, output, scratchPads); + 
cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_HOSP,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_hosp, 32) == 0; // cnv8 aka cnv2 - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, 0, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 160) == 0; #endif // cn xfh aka cn-heavy-superfast - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_XFH, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xfh, 32) == 0; // cnv8 + xtl aka cn-fast2 - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl_v9, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl_v9, 64) == 0; #endif + + // cnv8 + xcash + + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_DOUBLE, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_xcash, 32) == 0; + + // cnv8 + zelerius + + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_ZELERIUS, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_zelerius, 32) == 0; + + // cnv8 + rwz + + cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_RWZ, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_rwz, 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_RWZ, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_rwz, 64) == 0; + #endif + + // cnv9 aka cnv4 aka cnv5 aka cnr + + cryptonight_hash_ctx[0](asmOptimization, 10000, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, 
test_output_v4, 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](asmOptimization, 10000, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4, 64) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](asmOptimization, 10000, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4, 96) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 3 + cryptonight_hash_ctx[3](asmOptimization, 10000, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4, 128) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 4 + cryptonight_hash_ctx[4](asmOptimization, 10000, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4, 160) == 0; + #endif + + cryptonight_hash_ctx[0](asmOptimization, 10001, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4_1, 32) == 0; + + cryptonight_hash_ctx[0](asmOptimization, 10002, PowVariant::POW_V4, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v4_2, 32) == 0; } for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 10415ca9..aaf29145 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -42,8 +42,25 @@ #define POW_DEFAULT_INDEX_SHIFT 3 #define POW_XLT_V4_INDEX_SHIFT 4 +#if defined _MSC_VER || defined XMRIG_ARM +#define ABI_ATTRIBUTE +#else +#define ABI_ATTRIBUTE __attribute__((ms_abi)) +#endif + +struct ScratchPad; +typedef void(*cn_mainloop_fun_ms_abi)(ScratchPad*) ABI_ATTRIBUTE; +typedef void(*cn_mainloop_double_fun_ms_abi)(ScratchPad*, ScratchPad*) ABI_ATTRIBUTE; + +struct cryptonight_r_data { + int variant; + uint64_t height; + + bool match(const int v, const uint64_t h) const { return (v == variant) && (h == height); } +}; + struct ScratchPad { - alignas(16) uint8_t state[224]; // 224 instead of 200 to maintain aligned to 16 byte boundaries + alignas(16) uint8_t state[224]; alignas(16) uint8_t* memory; // Additional stuff for asm impl @@ -51,6 +68,11 @@ struct ScratchPad { const void* input; uint8_t* variant_table; const uint32_t* t_fn; + + cn_mainloop_fun_ms_abi generated_code; + cn_mainloop_double_fun_ms_abi generated_code_double; + cryptonight_r_data generated_code_data; + cryptonight_r_data generated_code_double_data; }; alignas(64) static uint8_t variant1_table[256]; @@ -63,12 +85,12 @@ class CryptoNight { public: static bool init(int algo, bool aesni); - static void hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); + static void hash(size_t factor, AsmOptimization asmOptimization, uint64_t height, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); public: private: - static bool selfTest(int algo); + static bool selfCheck(int algo); }; diff --git a/src/crypto/CryptoNightR_gen.cpp b/src/crypto/CryptoNightR_gen.cpp new file mode 100644 index 00000000..d856cade --- /dev/null +++ b/src/crypto/CryptoNightR_gen.cpp @@ -0,0 +1,190 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2018-2019 SChernykh + * Copyright 
2016-2019 XMRig <https://github.com/xmrig>, <support@xmrig.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <cstring>
+
+typedef void(*void_func)();
+
+#include "crypto/asm/CryptonightR_template.h"
+#include "Mem.h"
+
+#if !defined XMRIG_ARM && !defined XMRIG_NO_ASM
+
+#include "crypto/CryptoNight_x86.h"
+
+static inline void add_code(uint8_t* &p, void (*p1)(), void (*p2)())
+{
+    const ptrdiff_t size = reinterpret_cast<const uint8_t*>(p2) - reinterpret_cast<const uint8_t*>(p1);
+    if (size > 0) {
+        memcpy(p, reinterpret_cast<void*>(p1), size);
+        p += size;
+    }
+}
+
+static inline void add_random_math(uint8_t* &p, const V4_Instruction* code, int code_size, const void_func* instructions, const void_func* instructions_mov, bool is_64_bit, AsmOptimization ASM)
+{
+    uint32_t prev_rot_src = (uint32_t)(-1);
+
+    for (int i = 0;; ++i) {
+        const V4_Instruction inst = code[i];
+        if (inst.opcode == RET) {
+            break;
+        }
+
+        uint8_t opcode = (inst.opcode == MUL) ? inst.opcode : (inst.opcode + 2);
+        uint8_t dst_index = inst.dst_index;
+        uint8_t src_index = inst.src_index;
+
+        const uint32_t a = inst.dst_index;
+        const uint32_t b = inst.src_index;
+        const uint8_t c = opcode | (dst_index << V4_OPCODE_BITS) | (((src_index == 8) ? dst_index : src_index) << (V4_OPCODE_BITS + V4_DST_INDEX_BITS));
+
+        switch (inst.opcode) {
+        case ROR:
+        case ROL:
+            if (b != prev_rot_src) {
+                prev_rot_src = b;
+                add_code(p, instructions_mov[c], instructions_mov[c + 1]);
+            }
+            break;
+        }
+
+        if (a == prev_rot_src) {
+            prev_rot_src = (uint32_t)(-1);
+        }
+
+        void_func begin = instructions[c];
+
+        if ((ASM == AsmOptimization::ASM_BULLDOZER) && (inst.opcode == MUL) && !is_64_bit) {
+            // AMD Bulldozer has latency 4 for 32-bit IMUL and 6 for 64-bit IMUL
+            // Always use 32-bit IMUL for AMD Bulldozer in 32-bit mode - skip prefix 0x48 and change 0x49 to 0x41
+            uint8_t* prefix = reinterpret_cast<uint8_t*>(begin);
+
+            if (*prefix == 0x49) {
+                *(p++) = 0x41;
+            }
+
+            begin = reinterpret_cast<void_func>(prefix + 1);
+        }
+
+        add_code(p, begin, instructions[c + 1]);
+
+        if (inst.opcode == ADD) {
+            *(uint32_t*)(p - sizeof(uint32_t) - (is_64_bit ? 3 : 0)) = inst.C;
+            if (is_64_bit) {
+                prev_rot_src = (uint32_t)(-1);
+            }
+        }
+    }
+}
+
+void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightWOW_template_part1, CryptonightWOW_template_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightWOW_template_part2, CryptonightWOW_template_part3);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_mainloop) - ((const uint8_t*)CryptonightWOW_template_part1)) - (p - p0));
+    add_code(p, CryptonightWOW_template_part3, CryptonightWOW_template_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+
+void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightR_template_part1, CryptonightR_template_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightR_template_part2, CryptonightR_template_part3);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_mainloop) - ((const uint8_t*)CryptonightR_template_part1)) - (p - p0));
+    add_code(p, CryptonightR_template_part3, CryptonightR_template_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+
+void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightWOW_template_double_part1, CryptonightWOW_template_double_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightWOW_template_double_part2, CryptonightWOW_template_double_part3);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightWOW_template_double_part3, CryptonightWOW_template_double_part4);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_template_double_mainloop) - ((const uint8_t*)CryptonightWOW_template_double_part1)) - (p - p0));
+    add_code(p, CryptonightWOW_template_double_part4, CryptonightWOW_template_double_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+
+void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightR_template_double_part1, CryptonightR_template_double_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightR_template_double_part2, CryptonightR_template_double_part3);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightR_template_double_part3, CryptonightR_template_double_part4);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_template_double_mainloop) - ((const uint8_t*)CryptonightR_template_double_part1)) - (p - p0));
+    add_code(p, CryptonightR_template_double_part4, CryptonightR_template_double_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+
+void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightWOW_soft_aes_template_part1, CryptonightWOW_soft_aes_template_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightWOW_soft_aes_template_part2, CryptonightWOW_soft_aes_template_part3);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightWOW_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightWOW_soft_aes_template_part1)) - (p - p0));
+    add_code(p, CryptonightWOW_soft_aes_template_part3, CryptonightWOW_soft_aes_template_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+
+void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM)
+{
+    uint8_t* p0 = reinterpret_cast<uint8_t*>(machine_code);
+    uint8_t* p = p0;
+
+    add_code(p, CryptonightR_soft_aes_template_part1, CryptonightR_soft_aes_template_part2);
+    add_random_math(p, code, code_size, instructions, instructions_mov, false, ASM);
+    add_code(p, CryptonightR_soft_aes_template_part2, CryptonightR_soft_aes_template_part3);
+    *(int*)(p - 4) = static_cast<int>((((const uint8_t*)CryptonightR_soft_aes_template_mainloop) - ((const uint8_t*)CryptonightR_soft_aes_template_part1)) - (p - p0));
+    add_code(p, CryptonightR_soft_aes_template_part3, CryptonightR_soft_aes_template_end);
+
+    Mem::flushInstructionCache(machine_code, p - p0);
+}
+#endif
\ No newline at end of file
diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index b0e31ae6..6c3b2fd7 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -36,11 +36,26 @@
 #endif
 
+#define SWAP32LE(x) x
+#define SWAP64LE(x) x
+#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length))
+
+#ifndef NOINLINE
+#ifdef __GNUC__
+#define NOINLINE __attribute__ ((noinline))
+#elif _MSC_VER
+#define NOINLINE __declspec(noinline)
+#else
+#define NOINLINE
+#endif
+#endif
+
 #include <stdint.h>
 #include <string.h>
 #include "crypto/CryptoNight.h"
 #include "crypto/soft_aes.h"
+#include "variant4_random_math.h"
 
 extern "C"
@@ -111,11 +126,11 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i
 #define EXTRACT64(X) _mm_cvtsi128_si64(X)
 
-# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \
+# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax, reverse) \
 { \
-    const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))); \
+    const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((l) + ((idx) ^ (reverse ? 0x30 : 0x10)))); \
     const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \
-    const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \
+    const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ (reverse ? 0x10 : 0x30)))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \
@@ -136,18 +151,52 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i
     sqrt_result##idx += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \
 }
 
-# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \
+# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi, reverse) \
 { \
     const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \
     const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \
     const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \
     hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \
     lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \
+    if (reverse) { \
+        vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx1))); \
+        vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx0))); \
+    } else { \
+        vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \
+        vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \
+    } \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \
+}
+
+# define SHUFFLE_V4(l, idx, bx0, bx1, ax, cx) \
+{ \
+    const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))); \
+    const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \
+    const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \
     vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \
+    cx = veorq_u64(veorq_u64(cx, chunk3), veorq_u64(chunk1, chunk2)); \
 }
 
+# define VARIANT4_RANDOM_MATH_INIT(idx, h) \
+    uint32_t r##idx[9]; \
+    struct V4_Instruction code##idx[256]; \
+    r##idx[0] = (uint32_t)(h[12]); \
+    r##idx[1] = (uint32_t)(h[12] >> 32); \
+    r##idx[2] = (uint32_t)(h[13]); \
+    r##idx[3] = (uint32_t)(h[13] >> 32); \
+    v4_random_math_init(code##idx, VARIANT, height);
+
+# define VARIANT4_RANDOM_MATH(idx, al, ah, cl, bx0, bx1) \
+    cl ^= (r##idx[0] + r##idx[1]) | ((uint64_t)(r##idx[2] + r##idx[3]) << 32); \
+    r##idx[4] = static_cast<uint32_t>(al); \
+    r##idx[5] = static_cast<uint32_t>(ah); \
+    r##idx[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \
+    r##idx[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \
+    r##idx[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \
+    v4_random_math(code##idx, r##idx); \
+
 #if defined (__arm64__) || defined (__aarch64__)
 
 static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
@@ -640,8 +689,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou
     _mm_store_si128(output + 11, xout7);
 }
 
-// n-Loop version. Seems to be little bit slower then the hardcoded one.
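The VARIANT4_RANDOM_MATH* macros above carry the CN-R "random math" step: nine 32-bit registers per hash lane, where r[0..3] are seeded once from Keccak words h[12]/h[13] and the rest are refreshed every iteration from the running state before an interpreted, height-dependent instruction program runs. Condensed from the macro bodies (one lane, the idx suffix dropped):

    v4_random_math_init(code, VARIANT, height);             // generate the program once per block height
    // each main-loop iteration:
    cl ^= (r[0] + r[1]) | ((uint64_t)(r[2] + r[3]) << 32);  // fold registers into the data lane
    r[4] = (uint32_t)al;                                    // re-seed r[4..8] from the running state
    r[5] = (uint32_t)ah;
    r[6] = (uint32_t)_mm_cvtsi128_si32(bx0);
    r[7] = (uint32_t)_mm_cvtsi128_si32(bx1);
    r[8] = (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8));
    v4_random_math(code, r);                                // run the generated ALU program

Because the program changes with block height, the x86 path in CryptoNightR_gen.cpp compiles it to machine code and caches it per scratchpad (generated_code / generated_code_data in CryptoNight.h); this ARM header interprets it instead.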
-template +template class CryptoNightMultiHash { public: @@ -650,79 +698,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, - scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + //dummy } inline static void hashPowV2(const uint8_t* __restrict__ input, @@ -730,200 +706,24 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, - 200); - tweak1_2[hashBlock] = (*reinterpret_cast(input + 35 + hashBlock * size) ^ - 
*(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + //dummy } - // multi inline static void hashPowV3(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t sqrt_result[NUM_HASH_BLOCKS]; - uint64_t division_result_xmm[NUM_HASH_BLOCKS]; - __m128i bx0[NUM_HASH_BLOCKS]; - __m128i bx1[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; + //dummy + } - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, - scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - 
l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - - division_result_xmm[hashBlock] = h[hashBlock][12]; - sqrt_result[hashBlock] = h[hashBlock][13]; - } - - uint64_t sqrt_result0; - uint64_t division_result_xmm0; - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - - sqrt_result0 = sqrt_result[hashBlock]; - division_result_xmm0 = division_result_xmm[hashBlock]; - - INTEGER_MATH_V2(0, cl, cx[hashBlock]) - - sqrt_result[hashBlock] = sqrt_result0; - division_result_xmm[hashBlock] = division_result_xmm0; - - lo = __umul128(idx[hashBlock], cl, &hi); - - SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx1[hashBlock] = bx0[hashBlock]; - bx0[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + // dummy } inline static void hashLiteTube(const uint8_t* __restrict__ input, @@ -931,87 +731,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t 
tweak1_2[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, - 200); - tweak1_2[hashBlock] = (*reinterpret_cast(input + 35 + hashBlock * size) ^ - *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } - - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); - - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*) &l[hashBlock][idx[hashBlock] & - MASK])[0]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + //dummy } inline static void hashHeavy(const uint8_t* __restrict__ input, @@ -1019,161 +739,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, - scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) 
l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } - - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); - - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); - const int64_t n = vgetq_lane_s64(x, 0); - const int32_t d = vgetq_lane_s32(x, 2); - const int64_t q = n / (d | 0x5); - - ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - - idx[hashBlock] = d ^ q; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, - scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); - } - - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & 
MASK], - _mm_xor_si128(bx[hashBlock], cx)); - - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); - const int64_t n = vgetq_lane_s64(x, 0); - const int32_t d = vgetq_lane_s32(x, 2); - const int64_t q = n / (d | 0x5); - - ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - - idx[hashBlock] = (~d) ^ q; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + //dummy } inline static void hashHeavyTube(const uint8_t* __restrict__ input, @@ -1181,125 +747,12 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, - 200); - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + - hashBlock * size) ^ - *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - union alignas(16) - { - uint32_t k[4]; - uint64_t v64[2]; - }; - alignas(16) uint32_t x[4]; - -#define BYTE(p, i) ((unsigned char*)&p)[i] - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; - - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - - const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - _mm_store_si128((__m128i*) k, key); - cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - _mm_store_si128((__m128i*) x, cx); - - k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ - saes_table[3][BYTE(x[3], 3)]; - x[0] ^= k[0]; - k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ - saes_table[3][BYTE(x[0], 3)]; - x[1] ^= k[1]; - k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ - saes_table[3][BYTE(x[1], 3)]; - x[2] ^= k[2]; - k[3] ^= 
saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ - saes_table[3][BYTE(x[2], 3)]; - - cx = _mm_load_si128((__m128i*) k); - - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); - - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*) &l[hashBlock][idx[hashBlock] & - MASK])[0]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); - const int64_t n = vgetq_lane_s64(x, 0); - const int32_t d = vgetq_lane_s32(x, 2); - const int64_t q = n / (d | 0x5); - - ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - - idx[hashBlock] = d ^ q; - } - } - -#undef BYTE - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + //dummy } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -1462,7 +915,7 @@ public: cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); @@ -1476,7 +929,85 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + + keccakf(h0, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + // single + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + + { + keccak(input, (int) size, scratchPad[0]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t ah0 = h0[1] ^h0[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + uint64_t idx0 = h0[0] 
^h0[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h0) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + + idx0 = EXTRACT64(cx0); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10) + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); al0 += hi; ah0 += lo; @@ -1628,7 +1159,11 @@ public: ((int64_t*) &l[idx & MASK])[0] = n ^ q; - idx = d ^ q; + if (VARIANT == POW_XHV) { + idx = (~d) ^ q; + } else { + idx = d ^ q; + } } cn_implode_scratchpad_heavy((__m128i*) scratchPad[0]->memory, (__m128i*) scratchPad[0]->state); @@ -1636,75 +1171,6 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - const uint8_t* l; - uint64_t* h; - uint64_t al; - uint64_t ah; - __m128i bx; - uint64_t idx; - - keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - - l = scratchPad[0]->memory; - h = reinterpret_cast(scratchPad[0]->state); - - cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); - - al = h[0] ^ h[4]; - ah = h[1] ^ h[5]; - bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); - idx = h[0] ^ h[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); - } else { - cx = _mm_load_si128((__m128i*) &l[idx & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); - } - - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); - idx = EXTRACT64(cx); - bx = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[idx & MASK])[0]; - ch = ((uint64_t*) &l[idx & MASK])[1]; - lo = __umul128(idx, cl, &hi); - - al += hi; - ah += lo; - - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; - - ah ^= ch; - al ^= cl; - idx = al; - - const int64x2_t x = vld1q_s64(reinterpret_cast(&l[idx & MASK])); - const int64_t n = vgetq_lane_s64(x, 0); - const int32_t d = vgetq_lane_s32(x, 2); - const int64_t q = n / (d | 0x5); - - ((int64_t*) &l[idx & MASK])[0] = n ^ q; - - idx = (~d) ^ q; - } - - cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); - keccakf(h, 24); - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - } - - inline static void hashHeavyTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1809,8 +1275,8 @@ public: }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -2067,8 +1533,8 @@ public: cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, 
VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -2084,7 +1550,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -2107,7 +1573,136 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + // double + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10) + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] 
= al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11) + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); al1 += hi; ah1 += lo; @@ -2318,118 +1913,12 @@ public: ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; - idx0 = d0 ^ q0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); - const int64_t n1 = vgetq_lane_s64(x1, 0); - const int32_t d1 = vgetq_lane_s32(x1, 2); - const int64_t q1 = n1 / (d1 | 0x5); - - ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; - - idx1 = d1 ^ q1; - } - - cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); - - keccakf(h0, 24); - keccakf(h1, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak(input, (int) size, scratchPad[0]->state, 200); - keccak(input + size, (int) size, scratchPad[1]->state, 200); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - - cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + if (VARIANT == POW_XHV) { + idx0 = (~d0) ^ q0; } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + idx0 = d0 ^ q0; } - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - - bx0 = cx0; - bx1 = cx1; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & 
MASK])); - const int64_t n0 = vgetq_lane_s64(x0, 0); - const int32_t d0 = vgetq_lane_s32(x0, 2); - const int64_t q0 = n0 / (d0 | 0x5); - - ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; - - idx0 = (~d0) ^ q0; - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; lo = __umul128(idx1, cl, &hi); @@ -2451,7 +1940,11 @@ public: ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; - idx1 = (~d1) ^ q1; + if (VARIANT == POW_XHV) { + idx1 = (~d1) ^ q1; + } else { + idx1 = d1 ^ q1; + } } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -2635,8 +2128,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -2986,9 +2479,9 @@ public: cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -3006,7 +2499,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -3029,7 +2522,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -3052,7 +2545,185 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + // triple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + 
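// The a/b register setup in these hashPowV4 bodies follows one fixed pattern
// per lane; a single-lane sketch (illustrative, names assumed, `pad` = one
// ScratchPad*):
//
//     uint64_t* h   = reinterpret_cast<uint64_t*>(pad->state);
//     uint64_t  al  = h[0] ^ h[4];                                // AES key, low half
//     uint64_t  ah  = h[1] ^ h[5];                                // AES key, high half
//     __m128i   bx0 = _mm_set_epi64x(h[3] ^ h[7],  h[2] ^ h[6]);  // XOR register b
//     __m128i   bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); // 2nd b, CNv2+/CN-R only
//     uint64_t  idx = al;                                         // first scratchpad offset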
uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10) + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11) + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12) + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) al2 += hi; ah2 += lo; @@ -3326,162 +2997,12 @@ public: ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; - idx0 = d0 ^ q0; 
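// Plain cn-heavy and Haven (POW_XHV) differ only in the sign of the divisor
// word when the next index is formed, which is why the dedicated
// hashHeavyHaven copies removed here collapse into a single VARIANT check.
// Single-lane sketch of the step (illustrative, names assumed):
//
//     const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[idx & MASK]));
//     const int64_t n = vgetq_lane_s64(x, 0);      // 64-bit dividend
//     const int32_t d = vgetq_lane_s32(x, 2);      // 32-bit divisor word
//     const int64_t q = n / (d | 0x5);             // | 0x5 keeps the divisor non-zero
//     ((int64_t*) &l[idx & MASK])[0] = n ^ q;
//     idx = (VARIANT == POW_XHV) ? ((~d) ^ q)      // Haven negates the divisor
//                                : (d ^ q);        // plain cn-heavy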
- - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); - const int64_t n1 = vgetq_lane_s64(x1, 0); - const int32_t d1 = vgetq_lane_s32(x1, 2); - const int64_t q1 = n1 / (d1 | 0x5); - - ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; - - idx1 = d1 ^ q1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - - - const int64x2_t x2 = vld1q_s64(reinterpret_cast(&l2[idx2 & MASK])); - const int64_t n2 = vgetq_lane_s64(x2, 0); - const int32_t d2 = vgetq_lane_s32(x2, 2); - const int64_t q2 = n2 / (d2 | 0x5); - - ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2; - - idx2 = d2 ^ q2; - } - - cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); - - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak(input, (int) size, scratchPad[0]->state, 200); - keccak(input + size, (int) size, scratchPad[1]->state, 200); - keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - - cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + if (VARIANT == POW_XHV) { + idx0 = (~d0) ^ q0; } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = 
_mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + idx0 = d0 ^ q0; } - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); - const int64_t n0 = vgetq_lane_s64(x0, 0); - const int32_t d0 = vgetq_lane_s32(x0, 2); - const int64_t q0 = n0 / (d0 | 0x5); - - ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; - - idx0 = (~d0) ^ q0; - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; lo = __umul128(idx1, cl, &hi); @@ -3503,7 +3024,11 @@ public: ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; - idx1 = (~d1) ^ q1; + if (VARIANT == POW_XHV) { + idx1 = (~d1) ^ q1; + } else { + idx1 = d1 ^ q1; + } cl = ((uint64_t*) &l2[idx2 & MASK])[0]; ch = ((uint64_t*) &l2[idx2 & MASK])[1]; @@ -3519,6 +3044,7 @@ public: al2 ^= cl; idx2 = al2; + const int64x2_t x2 = vld1q_s64(reinterpret_cast(&l2[idx2 & MASK])); const int64_t n2 = vgetq_lane_s64(x2, 0); const int32_t d2 = vgetq_lane_s32(x2, 2); @@ -3526,7 +3052,11 @@ public: ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2; - idx2 = (~d2) ^ q2; + if (VARIANT == POW_XHV) { + idx2 = (~d2) ^ q2; + } else { + idx2 = d2 ^ q2; + } } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -3780,8 +3310,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -4220,10 +3750,10 @@ public: cx3 = _mm_aesenc_si128(cx3, ax3); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) - SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -4243,7 +3773,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -4266,7 +3796,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -4289,7 +3819,7 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) al2 += hi; ah2 += lo; @@ -4312,7 +3842,235 @@ public: lo = __umul128(idx3, cl, &hi); - SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + 
SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ) + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + // quadruple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + VARIANT4_RANDOM_MATH_INIT(3, h3) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) 
&l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10) + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11) + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12) + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + VARIANT4_RANDOM_MATH(3, al3, ah3, cl, bx03, bx13) + + if (VARIANT == POW_V4) { + al3 ^= r3[2] | ((uint64_t)(r3[3]) << 32); + ah3 ^= r3[0] | ((uint64_t)(r3[1]) << 32); + } + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3) al3 += hi; ah3 += lo; @@ -4550,14 +4308,6 @@ public: // not supported } - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - // not supported - } - inline static void hashHeavyTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4567,8 +4317,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash {// public: inline static void hash(const uint8_t* __restrict__ 
input, @@ -5095,11 +4845,11 @@ public: cx4 = _mm_aesenc_si128(cx4, ax4); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) - SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) - SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -5121,7 +4871,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -5144,7 +4894,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -5167,7 +4917,7 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) al2 += hi; ah2 += lo; @@ -5190,7 +4940,7 @@ public: lo = __umul128(idx3, cl, &hi); - SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ) al3 += hi; ah3 += lo; @@ -5213,7 +4963,283 @@ public: lo = __umul128(idx4, cl, &hi); - SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ) + + al4 += hi; + ah4 += lo; + + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + + bx14 = bx04; + bx04 = cx4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + + // quintuple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* 
l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + VARIANT4_RANDOM_MATH_INIT(3, h3) + VARIANT4_RANDOM_MATH_INIT(4, h4) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], ax4); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3) + SHUFFLE_V4(l4, (idx4&MASK), bx04, bx14, ax4, cx4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & 
MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10) + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11) + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12) + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + VARIANT4_RANDOM_MATH(3, al3, ah3, cl, bx03, bx13) + + if (VARIANT == POW_V4) { + al3 ^= r3[2] | ((uint64_t)(r3[3]) << 32); + ah3 ^= r3[0] | ((uint64_t)(r3[1]) << 32); + } + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + + VARIANT4_RANDOM_MATH(4, al4, ah4, cl, bx04, bx14) + + if (VARIANT == POW_V4) { + al4 ^= r4[2] | ((uint64_t)(r4[3]) << 32); + ah4 ^= r4[0] | ((uint64_t)(r4[1]) << 32); + } + + lo = __umul128(idx4, cl, &hi); + + SHUFFLE_V4(l4, (idx4&MASK), bx04, bx14, ax4, cx4); al4 += hi; ah4 += lo; @@ -5496,14 +5522,6 @@ public: // not supported } - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - // not supported - } - inline static void hashHeavyTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 836f2822..373c51cc 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -138,6 +138,53 @@ const static uint8_t test_output_xtl_v9[64] = { 0xF1, 0xF0, 0x55, 0x34, 0x15, 0x29, 0x93, 0x04, 0x2D, 0xED, 0xD2, 0x33, 0x50, 
0x6E, 0xBE, 0x25 }; +// CN XCASH +const static uint8_t test_output_xcash[32] = { + 0xAE, 0xFB, 0xB3, 0xF0, 0xCC, 0x88, 0x04, 0x6D, 0x11, 0x9F, 0x6C, 0x54, 0xB9, 0x6D, 0x90, 0xC9, + 0xE8, 0x84, 0xEA, 0x3B, 0x59, 0x83, 0xA6, 0x0D, 0x50, 0xA4, 0x2D, 0x7D, 0x3E, 0xBE, 0x48, 0x21 +}; + +// CN ZELERIUS +const static uint8_t test_output_zelerius[32] = { + 0x51, 0x6E, 0x33, 0xC6, 0xE4, 0x46, 0xAB, 0xBC, 0xCD, 0xAD, 0x18, 0xC0, 0x4C, 0xD9, 0xA2, 0x5E, + 0x64, 0x10, 0x28, 0x53, 0xB2, 0x0A, 0x42, 0xDF, 0xDE, 0xAA, 0x8B, 0x59, 0x9E, 0xCF, 0x40, 0xE2 +}; + +// CN RWZ +const static uint8_t test_output_rwz[64] = { + 0x5f, 0x56, 0xc6, 0xb0, 0x99, 0x6b, 0xa2, 0x3e, 0x0b, 0xba, 0x07, 0x29, 0xc9, 0x90, 0x74, 0x85, + 0x5a, 0x10, 0xe3, 0x08, 0x7f, 0xdb, 0xfe, 0x94, 0x75, 0x33, 0x54, 0x73, 0x76, 0xf0, 0x75, 0xb8, + 0x8b, 0x70, 0x43, 0x9a, 0xfc, 0xf5, 0xeb, 0x15, 0xbb, 0xf9, 0xad, 0x9d, 0x2a, 0xbd, 0x72, 0x52, + 0x49, 0x54, 0x0b, 0x91, 0xea, 0x61, 0x7f, 0x98, 0x7d, 0x39, 0x17, 0xb7, 0xd7, 0x65, 0xff, 0x75 +}; + +// CN V9 aka CN V4/V5 aka CN-R (height 10000) +const static uint8_t test_output_v4[160] = { + 0x90, 0x20, 0x14, 0x86, 0x1E, 0xCD, 0x01, 0xC5, 0x43, 0xB5, 0x61, 0xFA, 0xC8, 0x3D, 0xFF, 0x7D, + 0x76, 0x67, 0xC2, 0xD7, 0xB3, 0xD4, 0xE3, 0x4B, 0x4C, 0x7E, 0x6D, 0x04, 0x31, 0x79, 0xE6, 0x96, + 0xEA, 0xF4, 0x14, 0x76, 0x38, 0x94, 0x7C, 0xCE, 0x02, 0x50, 0x7A, 0x31, 0xB8, 0x4D, 0xDD, 0x3B, + 0x92, 0xAA, 0xC6, 0x49, 0xA1, 0x64, 0xA1, 0xA8, 0x7C, 0xD9, 0x43, 0x14, 0xC5, 0x12, 0x86, 0x61, + 0x0A, 0x18, 0xBD, 0x11, 0x36, 0x06, 0x31, 0x0D, 0x9D, 0xC0, 0x8C, 0x41, 0x88, 0xCB, 0x7C, 0xE9, + 0x5D, 0xD2, 0xBA, 0xA5, 0xFB, 0x0D, 0x2B, 0xA6, 0x6E, 0x7C, 0x78, 0x72, 0x38, 0xFE, 0x53, 0x17, + 0x1A, 0x96, 0x89, 0x0E, 0x14, 0xFF, 0x34, 0x42, 0xC0, 0x5A, 0xAB, 0xC0, 0x3F, 0x39, 0x4E, 0x43, + 0x91, 0x38, 0x67, 0x79, 0x5B, 0xAE, 0xCC, 0xA7, 0xDB, 0x4C, 0xFE, 0x8B, 0x75, 0x76, 0x1F, 0xC4, + 0x98, 0x71, 0xE6, 0xC1, 0x08, 0x9D, 0xED, 0xCC, 0x47, 0xC3, 0xF3, 0x7A, 0xA9, 0x4A, 0x3A, 0xB9, + 0xAC, 0xB8, 0x5C, 0x9F, 0xCC, 0xCB, 0xC1, 0x93, 0x9E, 0xC6, 0x6D, 0xCC, 0x45, 0xF4, 0xBA, 0xBD +}; + +// CN V9 aka CN V4/V5 aka CN-R (height 10001) +const static uint8_t test_output_v4_1[32] = { + 0x82, 0x58, 0x7D, 0x63, 0x7B, 0x6C, 0x0C, 0x96, 0x6A, 0x50, 0xF6, 0xC0, 0xAB, 0xB5, 0xEA, 0x1A, + 0x58, 0x2B, 0xEA, 0x7E, 0xF0, 0x2F, 0x3C, 0xA1, 0x7C, 0x1C, 0x7C, 0x2E, 0xF9, 0xE5, 0x66, 0xF2 +}; + +// CN V9 aka CN V4/V5 aka CN-R (height 10002) +const static uint8_t test_output_v4_2[32] = { + 0x64, 0xB2, 0x4E, 0x48, 0x4A, 0x28, 0xBF, 0x11, 0xC4, 0x8A, 0x68, 0xE7, 0xB7, 0x4B, 0xFD, 0xA7, + 0xFB, 0x95, 0x66, 0x05, 0x0C, 0xF7, 0xFA, 0xA7, 0x4B, 0xD9, 0x18, 0x59, 0x88, 0x7F, 0x47, 0xA2 +}; + + // CN-LITE const static uint8_t test_output_v0_lite[160] = { 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 0c9127f1..cbe970e7 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -36,10 +36,24 @@ # define __restrict__ __restrict #endif +#define SWAP32LE(x) x +#define SWAP64LE(x) x +#define hash_extra_blake(data, length, hash) blake256_hash((uint8_t*)(hash), (uint8_t*)(data), (length)) + +#ifndef NOINLINE +#ifdef __GNUC__ +#define NOINLINE __attribute__ ((noinline)) +#elif _MSC_VER +#define NOINLINE __declspec(noinline) +#else +#define NOINLINE +#endif +#endif #include "crypto/CryptoNight.h" #include "crypto/soft_aes.h" #include "AsmOptimization.h" +#include "variant4_random_math.h" extern "C" { @@ -71,6 +85,19 @@ 
extern "C" void cnv2_main_loop_ultralite_bulldozer_asm(ScratchPad* ctx0); void cnv2_double_main_loop_ultralite_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cnv2_main_loop_xcash_ivybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_xcash_ryzen_asm(ScratchPad* ctx0); + void cnv2_main_loop_xcash_bulldozer_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_xcash_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + + void cnv2_main_loop_zelerius_ivybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_zelerius_ryzen_asm(ScratchPad* ctx0); + void cnv2_main_loop_zelerius_bulldozer_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_zelerius_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + + void cnv2_main_loop_rwz_all_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_rwz_all_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cnv1_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_lite_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_fast_soft_aes_sandybridge_asm(ScratchPad* ctx0); @@ -80,6 +107,16 @@ extern "C" void cnv2_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(ScratchPad* ctx); + void cnv2_main_loop_xcash_soft_aes_sandybridge_asm(ScratchPad* ctx); + void cnv2_main_loop_zelerius_soft_aes_sandybridge_asm(ScratchPad* ctx); + + void wow_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); + void wow_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); + void wow_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); + + void v4_soft_aes_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); + void v4_compile_code(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); + void v4_compile_code_double(const V4_Instruction* code, int code_size, void* machine_code, AsmOptimization ASM); #endif } @@ -148,24 +185,22 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin } #endif -#ifdef _MSC_VER -#else -#endif - #ifdef _MSC_VER # define SET_ROUNDING_MODE_UP() _control87(RC_UP, MCW_RC); +# define SET_ROUNDING_MODE_DOWN() _control87(RC_DOWN, MCW_RC); #else # define SET_ROUNDING_MODE_UP() std::fesetround(FE_UPWARD); +# define SET_ROUNDING_MODE_DOWN() fesetround(FE_DOWNWARD); #endif -# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \ +# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax, reverse) \ { \ - const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))); \ - const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ - const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ - _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ - _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ - _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + 
_mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ } # define INTEGER_MATH_V2(idx, cl, cx) \ @@ -179,18 +214,47 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin sqrt_result##idx = int_sqrt_v2(cx_ + division_result); \ } -# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \ +# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi, reverse) \ { \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \ lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ +} + +# define SHUFFLE_V4(l, idx, bx0, bx1, ax, cx) \ +{ \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ + cx = _mm_xor_si128(_mm_xor_si128(cx, chunk3), _mm_xor_si128(chunk1, chunk2)); \ } + +# define VARIANT4_RANDOM_MATH_INIT(idx, h) \ + uint32_t r##idx[9]; \ + struct V4_Instruction code##idx[256]; \ + r##idx[0] = (uint32_t)(h[12]); \ + r##idx[1] = (uint32_t)(h[12] >> 32); \ + r##idx[2] = (uint32_t)(h[13]); \ + r##idx[3] = (uint32_t)(h[13] >> 32); \ + v4_random_math_init(code##idx, VARIANT, height); + +# define VARIANT4_RANDOM_MATH(idx, al, ah, cl, bx0, bx1) \ + cl ^= (r##idx[0] + r##idx[1]) | ((uint64_t)(r##idx[2] + r##idx[3]) << 32); \ + r##idx[4] = static_cast<uint32_t>(al); \ + r##idx[5] = static_cast<uint32_t>(ah); \ + r##idx[6] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx0)); \ + r##idx[7] = static_cast<uint32_t>(_mm_cvtsi128_si32(bx1)); \ + r##idx[8] = static_cast<uint32_t>(_mm_cvtsi128_si32(_mm_srli_si128(bx1, 8))); \ + v4_random_math(code##idx, r##idx); \ + static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { blake256_hash(output, input, len); } @@ -592,7 +656,7 @@ return r; } // n-Loop version. Seems to be a little bit slower than the hardcoded one. 
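// How the two CN-R macros above compose inside the hashPowV4 bodies below
// (illustrative, lane 0; `height` is the extra hashPowV4 parameter):
//
//     VARIANT4_RANDOM_MATH_INIT(0, h0)   // r0[0..3] seeded from h0[12..13];
//                                        // code0[] generated from (VARIANT, height)
//     for (size_t i = 0; i < ITERATIONS; i++) {
//         // ... AES round, SHUFFLE_V4, store ...
//         VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10)  // run code0 over r0[0..8]
//         if (VARIANT == POW_V4) {                    // Monero CN-R feeds the result
//             al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32);  // back into the AES key;
//             ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32);  // Wownero ("wow") skips this
//         }
//         // ... multiply, shuffle, write-back ...
//     }
//
// Because the random program changes at every block height, the wow_/v4_
// compile_code helpers declared further up regenerate the asm main-loop
// machine code whenever the height changes, rather than calling a fixed
// kernel.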
-template +template class CryptoNightMultiHash { public: @@ -601,78 +665,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } inline static void hashPowV2(const uint8_t* __restrict__ input, @@ -680,220 +673,53 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - 
*(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } inline static void hashPowV2_asm(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { - // not supported + // dummy } - // multi inline static void hashPowV3(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t sqrt_result[NUM_HASH_BLOCKS]; - __m128i bx0[NUM_HASH_BLOCKS]; - __m128i bx1[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - __m128i division_result_xmm[NUM_HASH_BLOCKS]; - - 
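// Editor's sketch (mirrors the removed hashPowV2 body above): the CNv1 tweak
// word is input[35..42] XORed with the 25th 64-bit word of the Keccak state,
// and byte 11 of every written scratchpad line is twiddled through the 0x75310
// nibble table. INDEX_SHIFT = 3 is an assumption here (the real value is a
// template parameter of this file).
#include <cstdint>
#include <cstring>

static const int INDEX_SHIFT = 3;  // assumption: standard variant-1 shift

static uint64_t cnv1_tweak_word(const uint8_t* input, const uint8_t* state /* 200 B */)
{
    uint64_t in_word, st_word;
    std::memcpy(&in_word, input + 35, sizeof in_word);    // unaligned-safe load
    std::memcpy(&st_word, state + 24 * 8, sizeof st_word); // state word 24
    return in_word ^ st_word;
}

static uint8_t cnv1_tweak_byte(uint8_t tmp)
{
    static const uint32_t table = 0x75310;
    const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
    return tmp ^ ((table >> index) & 0x30);
}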
for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - - division_result_xmm[hashBlock] = _mm_cvtsi64_si128(h[hashBlock][12]); - sqrt_result[hashBlock] = h[hashBlock][13]; - } - - SET_ROUNDING_MODE_UP(); - - uint64_t sqrt_result0; - __m128i division_result_xmm0; - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; - - sqrt_result0 = sqrt_result[hashBlock]; - division_result_xmm0 = division_result_xmm[hashBlock]; - - INTEGER_MATH_V2(0, cl, cx[hashBlock]) - - sqrt_result[hashBlock] = sqrt_result0; - division_result_xmm[hashBlock] = division_result_xmm0; - - lo = __umul128(idx[hashBlock], cl, &hi); - - SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx1[hashBlock] = bx0[hashBlock]; - bx0[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } inline static void hashPowV3_asm(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { - // not supported + // 
dummy + } + + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + // dummy + } + + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) + { + // dummy } inline static void hashLiteTube(const uint8_t* __restrict__ input, @@ -901,94 +727,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= 
((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } inline static void hashHeavy(const uint8_t* __restrict__ input, @@ -996,171 +735,7 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - idx[hashBlock] = d ^ q; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + 
hashBlock * 32); - } - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; - __m128i ax[NUM_HASH_BLOCKS]; - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - if (SOFT_AES) { - cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); - } else { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - uint64_t hi, lo, cl, ch; - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - idx[hashBlock] = (~d) ^ q; - - bx[hashBlock] = cx[hashBlock]; - } - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } inline static void hashHeavyTube(const uint8_t* __restrict__ input, @@ -1168,130 +743,12 @@ public: uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) { - const uint8_t* l[NUM_HASH_BLOCKS]; - uint64_t* h[NUM_HASH_BLOCKS]; - uint64_t al[NUM_HASH_BLOCKS]; - uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; - uint64_t tweak1_2[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; - __m128i cx[NUM_HASH_BLOCKS]; 
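// Editor's sketch of the cn-heavy extra step seen in the removed bodies above.
// The patch folds hashHeavyHaven into hashHeavy: the Haven (POW_XHV) variant
// differs only in complementing d before the final XOR that selects the next
// scratchpad index.
#include <cstdint>

static uint64_t heavy_step(int64_t* word0, int32_t d, bool haven)
{
    const int64_t n = *word0;
    const int64_t q = n / (d | 0x5);  // | 0x5 keeps the divisor odd and non-zero
    *word0 = n ^ q;                   // quotient is fed back into the scratchpad
    return haven ? (uint64_t)((~d) ^ q) : (uint64_t)(d ^ q);
}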
- - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); - tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = scratchPad[hashBlock]->memory; - h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); - - cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); - - al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); - idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; - } - - union alignas(16) { - uint32_t k[4]; - uint64_t v64[2]; - }; - alignas(16) uint32_t x[4]; - -#define BYTE(p, i) ((unsigned char*)&p)[i] - - for (size_t i = 0; i < ITERATIONS; i++) { - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - const __m128i &key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - - _mm_store_si128((__m128i *) k, key); - cx[hashBlock] = _mm_xor_si128(cx[hashBlock], _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - _mm_store_si128((__m128i *) x, cx[hashBlock]); - - k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ - saes_table[3][BYTE(x[3], 3)]; - x[0] ^= k[0]; - k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ - saes_table[3][BYTE(x[0], 3)]; - x[1] ^= k[1]; - k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ - saes_table[3][BYTE(x[1], 3)]; - x[2] ^= k[2]; - k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ - saes_table[3][BYTE(x[2], 3)]; - - cx[hashBlock] = _mm_load_si128((__m128i *) k); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx[hashBlock])); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - idx[hashBlock] = EXTRACT64(cx[hashBlock]); - } - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; - lo = __umul128(idx[hashBlock], cl, &hi); - - al[hashBlock] += hi; - ah[hashBlock] += lo; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; - - ah[hashBlock] ^= tweak1_2[hashBlock]; - - ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - - 
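// Editor's sketch of the table-driven AES round in the removed hashHeavyTube
// body above (where cx is first complemented via _mm_cmpeq_epi32). Each output
// word mixes one byte from each of the four state words through the four
// lookup tables; the rotating indices are ShiftRows in disguise, and the
// in-place x[i] ^= k[i] updates between words are preserved. byte_of() assumes
// little-endian extraction, matching the BYTE() pointer cast.
#include <cstdint>

static inline uint32_t byte_of(uint32_t v, int i) { return (v >> (8 * i)) & 0xFF; }

static void soft_aes_round(uint32_t k[4], uint32_t x[4], const uint32_t t[4][256])
{
    for (int i = 0; i < 4; ++i) {
        k[i] ^= t[0][byte_of(x[(i + 0) & 3], 0)]
              ^ t[1][byte_of(x[(i + 1) & 3], 1)]
              ^ t[2][byte_of(x[(i + 2) & 3], 2)]
              ^ t[3][byte_of(x[(i + 3) & 3], 3)];
        if (i < 3) {
            x[i] ^= k[i];  // later words see the updated state, as in the original
        }
    }
}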
ah[hashBlock] ^= ch; - al[hashBlock] ^= cl; - idx[hashBlock] = al[hashBlock]; - - int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - idx[hashBlock] = d ^ q; - - bx[hashBlock] = cx[hashBlock]; - } - } - -#undef BYTE - - for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); - keccakf(h[hashBlock], 24); - extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); - } + // dummy } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -1425,8 +882,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); @@ -1447,7 +903,7 @@ public: if (SOFT_AES) { scratchPad[0]->t_fn = (const uint32_t*)saes_table; - switch (powVariant) + switch (VARIANT) { case POW_MSR: cnv1_main_loop_fast_soft_aes_sandybridge_asm(scratchPad[0]); @@ -1468,7 +924,7 @@ public: break; } } else { - switch (powVariant) + switch (VARIANT) { case POW_MSR: cnv1_main_loop_fast_sandybridge_asm(scratchPad[0]); @@ -1533,7 +989,7 @@ public: cx = _mm_aesenc_si128(cx, ax); } - SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); @@ -1547,7 +1003,7 @@ public: lo = __umul128(idx, cl, &hi); - SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi, VARIANT == POW_RWZ) al += hi; // two fence statements are overhead ah += lo; @@ -1568,14 +1024,12 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } - // single asm inline static void hashPowV3_asm(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { const uint8_t* l = scratchPad[0]->memory; uint64_t* h = reinterpret_cast(scratchPad[0]->state); @@ -1589,7 +1043,7 @@ public: scratchPad[0]->input = input; scratchPad[0]->t_fn = (const uint32_t*)saes_table; - switch (powVariant) + switch (VARIANT) { case POW_FAST_2: cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(scratchPad[0]); @@ -1597,12 +1051,18 @@ public: case POW_TURTLE: cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(scratchPad[0]); break; + case POW_DOUBLE: + cnv2_main_loop_xcash_soft_aes_sandybridge_asm(scratchPad[0]); + break; + case POW_ZELERIUS: + cnv2_main_loop_zelerius_soft_aes_sandybridge_asm(scratchPad[0]); + break; default: cnv2_main_loop_soft_aes_sandybridge_asm(scratchPad[0]); break; } } else { - switch (powVariant) + switch (VARIANT) { case POW_FAST_2: cnv2_main_loop_fastv2_ivybridge_asm(scratchPad[0]); @@ -1610,13 +1070,22 @@ public: case POW_TURTLE: cnv2_main_loop_ultralite_ivybridge_asm(scratchPad[0]); break; + case POW_DOUBLE: + cnv2_main_loop_xcash_ivybridge_asm(scratchPad[0]); + break; + case POW_ZELERIUS: + cnv2_main_loop_zelerius_ivybridge_asm(scratchPad[0]); + break; + case POW_RWZ: + cnv2_main_loop_rwz_all_asm(scratchPad[0]); + break; 
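// Editor's note: powVariant used to be a runtime argument; it is now the
// template parameter VARIANT, so every switch (VARIANT) above is resolved at
// compile time and the dead main-loop calls drop out of each instantiation.
// Minimal illustration with a trimmed stand-in for the real PowVariant enum:
#include <cstdio>

enum PowVariantDemo { DEMO_V2, DEMO_TURTLE, DEMO_RWZ };

template <PowVariantDemo VARIANT>
static void main_loop_dispatch()
{
    switch (VARIANT) {                 // VARIANT is a constant per instantiation
    case DEMO_TURTLE: std::puts("turtle loop");  break;
    case DEMO_RWZ:    std::puts("rwz loop");     break;
    default:          std::puts("cnv2 loop");    break;
    }
}

int main() { main_loop_dispatch<DEMO_RWZ>(); }   // only the rwz branch survives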
default: cnv2_main_loop_ivybridge_asm(scratchPad[0]); break; } } } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { - switch (powVariant) + switch (VARIANT) { case POW_FAST_2: cnv2_main_loop_fastv2_ryzen_asm(scratchPad[0]); @@ -1624,12 +1093,21 @@ public: case POW_TURTLE: cnv2_main_loop_ultralite_ryzen_asm(scratchPad[0]); break; + case POW_DOUBLE: + cnv2_main_loop_xcash_ryzen_asm(scratchPad[0]); + break; + case POW_ZELERIUS: + cnv2_main_loop_zelerius_ryzen_asm(scratchPad[0]); + break; + case POW_RWZ: + cnv2_main_loop_rwz_all_asm(scratchPad[0]); + break; default: cnv2_main_loop_ryzen_asm(scratchPad[0]); break; } } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { - switch (powVariant) + switch (VARIANT) { case POW_FAST_2: cnv2_main_loop_fastv2_bulldozer_asm(scratchPad[0]); @@ -1637,6 +1115,15 @@ public: case POW_TURTLE: cnv2_main_loop_ultralite_bulldozer_asm(scratchPad[0]); break; + case POW_DOUBLE: + cnv2_main_loop_xcash_bulldozer_asm(scratchPad[0]); + break; + case POW_ZELERIUS: + cnv2_main_loop_zelerius_bulldozer_asm(scratchPad[0]); + break; + case POW_RWZ: + cnv2_main_loop_rwz_all_asm(scratchPad[0]); + break; default: cnv2_main_loop_bulldozer_asm(scratchPad[0]); break; @@ -1649,6 +1136,140 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + // single + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + const uint8_t*l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + uint64_t al = h[0] ^ h[4]; + uint64_t ah = h[1] ^ h[5]; + + __m128i bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + __m128i bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); + + uint64_t idx = h[0] ^ h[4]; + + VARIANT4_RANDOM_MATH_INIT(0, h) + + SET_ROUNDING_MODE_UP(); + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + const __m128i ax = _mm_set_epi64x(ah, al); + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], ax); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, ax); + } + + SHUFFLE_V4(l, (idx&MASK), bx0, bx1, ax, cx) + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + + idx = EXTRACT64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al, ah, cl, bx0, bx1) + + if (VARIANT == POW_V4) { + al ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx, cl, &hi); + + SHUFFLE_V4(l, (idx&MASK), bx0, bx1, ax, cx) + + al += hi; // two fence statements are overhead + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + // single asm + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) + { + const uint8_t* l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + keccak(static_cast(input), (int) size, 
scratchPad[0]->state, 200); + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (SOFT_AES) { + if (!scratchPad[0]->generated_code_data.match(VARIANT, height)) { + V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, VARIANT, height); + + if (VARIANT == POW_WOW) { + wow_soft_aes_compile_code(code, code_size, reinterpret_cast(scratchPad[0]->generated_code), ASM_OFF); + } else { + v4_soft_aes_compile_code(code, code_size, reinterpret_cast(scratchPad[0]->generated_code), ASM_OFF); + } + + scratchPad[0]->generated_code_data.variant = VARIANT; + scratchPad[0]->generated_code_data.height = height; + } + + scratchPad[0]->input = input; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + scratchPad[0]->generated_code(scratchPad[0]); + } else { + if (!scratchPad[0]->generated_code_data.match(VARIANT, height)) { + V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, VARIANT, height); + + if (VARIANT == POW_WOW) { + wow_compile_code(code, code_size, reinterpret_cast(scratchPad[0]->generated_code), asmOptimization); + } else { + v4_compile_code(code, code_size, reinterpret_cast(scratchPad[0]->generated_code), asmOptimization); + } + + scratchPad[0]->generated_code_data.variant = VARIANT; + scratchPad[0]->generated_code_data.height = height; + } + + scratchPad[0]->generated_code(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, @@ -1777,73 +1398,12 @@ public: int64_t q = n / (d | 0x5); ((int64_t*)&l[idx & MASK])[0] = n ^ q; - idx = d ^ q; - } - cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); - keccakf(h, 24); - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - const uint8_t* l; - uint64_t* h; - uint64_t al; - uint64_t ah; - __m128i bx; - uint64_t idx; - - keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - - l = scratchPad[0]->memory; - h = reinterpret_cast(scratchPad[0]->state); - - cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); - - al = h[0] ^ h[4]; - ah = h[1] ^ h[5]; - bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); - idx = h[0] ^ h[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); + if (VARIANT == POW_XHV) { + idx = (~d) ^ q; } else { - cx = _mm_load_si128((__m128i*) &l[idx & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); + idx = d ^ q; } - - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); - idx = EXTRACT64(cx); - bx = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[idx & MASK])[0]; - ch = ((uint64_t*) &l[idx & MASK])[1]; - lo = __umul128(idx, cl, &hi); - - al += hi; - ah += lo; - - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; - - ah ^= ch; - al ^= cl; - idx = al; - - int64_t n = ((int64_t*)&l[idx & MASK])[0]; - int32_t d = ((int32_t*)&l[idx & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l[idx & MASK])[0] = n ^ q; - idx = (~d) ^ q; } cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); @@ -1947,8 +1507,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class 
CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -2154,8 +1714,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { // not supported } @@ -2221,8 +1780,8 @@ public: cx1 = _mm_aesenc_si128(cx1, ax1); } - SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -2243,7 +1802,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi) + SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -2312,7 +1871,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi) + SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -2343,8 +1902,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); @@ -2358,13 +1916,22 @@ public: cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); #ifndef XMRIG_NO_ASM - switch(powVariant) { + switch(VARIANT) { case POW_FAST_2: cnv2_double_main_loop_fastv2_sandybridge_asm(scratchPad[0], scratchPad[1]); break; case POW_TURTLE: cnv2_double_main_loop_ultralite_sandybridge_asm(scratchPad[0], scratchPad[1]); break; + case POW_DOUBLE: + cnv2_double_main_loop_xcash_sandybridge_asm(scratchPad[0], scratchPad[1]); + break; + case POW_ZELERIUS: + cnv2_double_main_loop_zelerius_sandybridge_asm(scratchPad[0], scratchPad[1]); + break; + case POW_RWZ: + cnv2_double_main_loop_rwz_all_asm(scratchPad[0], scratchPad[1]); + break; default: cnv2_double_main_loop_sandybridge_asm(scratchPad[0], scratchPad[1]); break; @@ -2380,6 +1947,184 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + // double + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 
= h1[0] ^h1[4]; + + SET_ROUNDING_MODE_UP(); + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10); + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11); + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + +#ifndef XMRIG_NO_ASM + if (!scratchPad[0]->generated_code_double_data.match(VARIANT, height)) { + V4_Instruction code[256]; + const int code_size = v4_random_math_init(code, VARIANT, height); + + if (VARIANT == POW_WOW) { + wow_compile_code_double(code, code_size, reinterpret_cast(scratchPad[0]->generated_code_double), asmOptimization); + } else { + v4_compile_code_double(code, code_size, reinterpret_cast(scratchPad[0]->generated_code_double), asmOptimization); + } + + 
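// Editor's note, kept as code since the hunk continues mid-block below: this
// is the shape of the (variant, height) guard used by hashPowV4_asm. The CN-R
// random program changes only at a new block height, so one compilation is
// amortised over every hash at that height. GeneratedCodeData is a stand-in
// for the real generated_code_double_data member, not its actual definition.
#include <cstdint>

struct GeneratedCodeData {
    int      variant = -1;
    uint64_t height  = ~0ULL;
    bool match(int v, uint64_t h) const { return v == variant && h == height; }
};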
scratchPad[0]->generated_code_double_data.variant = VARIANT; + scratchPad[0]->generated_code_double_data.height = height; + } + + scratchPad[0]->generated_code_double(scratchPad[0], scratchPad[1]); +#endif + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2566,120 +2311,14 @@ public: int64_t q = n / (d | 0x5); ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q; - idx[0] = d ^ q; - bx0 = cx0; - - cl = ((uint64_t*) &l1[idx[1] & MASK])[0]; - ch = ((uint64_t*) &l1[idx[1] & MASK])[1]; - lo = __umul128(idx[1], cl, &hi); - - al1 += hi; - ah1 += lo; - - ((uint64_t*) &l1[idx[1] & MASK])[0] = al1; - ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1; - - ah1 ^= ch; - al1 ^= cl; - idx[1] = al1; - - n = ((int64_t*)&l1[idx[1] & MASK])[0]; - d = ((int32_t*)&l1[idx[1] & MASK])[2]; - q = n / (d | 0x5); - - ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q; - idx[1] = d ^ q; - - bx1 = cx1; - - } - - cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); - - keccakf(h0, 24); - keccakf(h1, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - - cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx[2]; - - idx[0] = al0; - idx[1] = al1; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1)); + if (VARIANT == POW_XHV) { + idx[0] = (~d) ^ q; } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + idx[0] = d ^ q; } - _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1)); - - idx[0] = EXTRACT64(cx0); - idx[1] = EXTRACT64(cx1); - bx0 = cx0; - bx1 = cx1; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx[0] & MASK])[0]; - ch = ((uint64_t*) &l0[idx[0] & MASK])[1]; - lo = __umul128(idx[0], cl, &hi); - - al0 += hi; - ah0 += lo; - - ((uint64_t*) &l0[idx[0] & MASK])[0] = 
al0; - ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0; - - ah0 ^= ch; - al0 ^= cl; - idx[0] = al0; - - int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q; - idx[0] = (~d) ^ q; - cl = ((uint64_t*) &l1[idx[1] & MASK])[0]; ch = ((uint64_t*) &l1[idx[1] & MASK])[1]; @@ -2700,7 +2339,15 @@ public: q = n / (d | 0x5); ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q; - idx[1] = (~d) ^ q; + + if (VARIANT == POW_XHV) { + idx[1] = (~d) ^ q; + } else { + idx[1] = d ^ q; + } + + bx1 = cx1; + } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -2870,8 +2517,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -3154,8 +2801,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { // not supported } @@ -3234,9 +2880,9 @@ public: cx2 = _mm_aesenc_si128(cx2, ax2); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -3254,7 +2900,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -3277,7 +2923,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -3299,7 +2945,7 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) al2 += hi; ah2 += lo; @@ -3332,8 +2978,197 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) + { + // not supported + } + + // triple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + + 
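// Editor's sketch of the register initialisation pattern used by every main
// loop above and below: a = (h0^h4, h1^h5), b0 = (h2^h6, h3^h7), and for
// CNv2/CN-R additionally b1 = (h8^h10, h9^h11), with the first scratchpad
// index taken from the low half of a. h is the 200-byte Keccak state viewed
// as uint64_t[25].
#include <cstdint>

struct CnRegs { uint64_t al, ah, bx0[2], bx1[2], idx; };

static CnRegs init_regs(const uint64_t h[25])
{
    CnRegs r;
    r.al = h[0] ^ h[4];      r.ah = h[1] ^ h[5];
    r.bx0[0] = h[2] ^ h[6];  r.bx0[1] = h[3] ^ h[7];
    r.bx1[0] = h[8] ^ h[10]; r.bx1[1] = h[9] ^ h[11];
    r.idx = r.al;            // first scratchpad offset
    return r;
}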
uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + SET_ROUNDING_MODE_UP(); + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10); + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11); + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12); + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + 
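// Editor's sketch of the finalisation that follows each main loop below: the
// scratchpad is imploded back into the Keccak state, keccakf permutes it, and
// the low two bits of state byte 0 pick one of the four finishers in the
// extra_hashes table (BLAKE-256, Groestl-256, JH-256, Skein-256 in upstream
// CryptoNight; assumed to match this file's table).
#include <cstdint>
#include <cstddef>

using Finisher = void (*)(const uint8_t* in, size_t len, uint8_t* out);

static void finalize(uint8_t state[200], const Finisher finishers[4], uint8_t out[32])
{
    // ...cn_implode_scratchpad() and keccakf(state, 24) have already run...
    finishers[state[0] & 3](state, 200, out);  // data-dependent, variant-independent
}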
cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) { // not supported } @@ -3582,158 +3417,13 @@ public: int64_t q = n / (d | 0x5); ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q; - idx[0] = d ^ q; - - cl = ((uint64_t*) &l1[idx[1] & MASK])[0]; - ch = ((uint64_t*) &l1[idx[1] & MASK])[1]; - lo = __umul128(idx[1], cl, &hi); - - al1 += hi; - ah1 += lo; - - ((uint64_t*) &l1[idx[1] & MASK])[0] = al1; - ((uint64_t*) &l1[idx[1] & MASK])[1] = ah1; - - ah1 ^= ch; - al1 ^= cl; - idx[1] = al1; - - n = ((int64_t*)&l1[idx[1] & MASK])[0]; - d = ((int32_t*)&l1[idx[1] & MASK])[2]; - q = n / (d | 0x5); - - ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q; - idx[1] = d ^ q; - - - cl = ((uint64_t*) &l2[idx[2] & MASK])[0]; - ch = ((uint64_t*) &l2[idx[2] & MASK])[1]; - lo = __umul128(idx[2], cl, &hi); - - al2 += hi; - ah2 += lo; - - ((uint64_t*) &l2[idx[2] & MASK])[0] = al2; - ((uint64_t*) &l2[idx[2] & MASK])[1] = ah2; - - ah2 ^= ch; - al2 ^= cl; - idx[2] = al2; - - n = ((int64_t*)&l2[idx[2] & MASK])[0]; - d = ((int32_t*)&l2[idx[2] & MASK])[2]; - q = n / (d | 0x5); - - ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q; - idx[2] = d ^ q; - } - - cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); - - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - } - - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - - cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - - uint64_t idx[2]; - - 
idx[0] = al0; - idx[1] = al1; - idx[2] = al2; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx[0] & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx[1] & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx[2] & MASK], _mm_set_epi64x(ah2, al2)); + if (VARIANT == POW_XHV) { + idx[0] = (~d) ^ q; } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx[0] & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx[1] & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx[2] & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + idx[0] = d ^ q; } - _mm_store_si128((__m128i*) &l0[idx[0] & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx[1] & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx[2] & MASK], _mm_xor_si128(bx2, cx2)); - - idx[0] = EXTRACT64(cx0); - idx[1] = EXTRACT64(cx1); - idx[2] = EXTRACT64(cx2); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx[0] & MASK])[0]; - ch = ((uint64_t*) &l0[idx[0] & MASK])[1]; - lo = __umul128(idx[0], cl, &hi); - - al0 += hi; - ah0 += lo; - - ((uint64_t*) &l0[idx[0] & MASK])[0] = al0; - ((uint64_t*) &l0[idx[0] & MASK])[1] = ah0; - - ah0 ^= ch; - al0 ^= cl; - idx[0] = al0; - - int64_t n = ((int64_t*)&l0[idx[0] & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx[0] & MASK])[2]; - int64_t q = n / (d | 0x5); - - ((int64_t*)&l0[idx[0] & MASK])[0] = n ^ q; - idx[0] = (~d) ^ q; - - cl = ((uint64_t*) &l1[idx[1] & MASK])[0]; ch = ((uint64_t*) &l1[idx[1] & MASK])[1]; lo = __umul128(idx[1], cl, &hi); @@ -3753,8 +3443,12 @@ public: q = n / (d | 0x5); ((int64_t*)&l1[idx[1] & MASK])[0] = n ^ q; - idx[1] = (~d) ^ q; + if (VARIANT == POW_XHV) { + idx[1] = (~d) ^ q; + } else { + idx[1] = d ^ q; + } cl = ((uint64_t*) &l2[idx[2] & MASK])[0]; ch = ((uint64_t*) &l2[idx[2] & MASK])[1]; @@ -3775,7 +3469,12 @@ public: q = n / (d | 0x5); ((int64_t*)&l2[idx[2] & MASK])[0] = n ^ q; - idx[2] = (~d) ^ q; + + if (VARIANT == POW_XHV) { + idx[2] = (~d) ^ q; + } else { + idx[2] = d ^ q; + } } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -4010,8 +3709,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -4367,8 +4066,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { // not supported } @@ -4464,10 +4162,10 @@ public: cx3 = _mm_aesenc_si128(cx3, ax3); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) - SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -4487,7 +4185,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, 
ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -4510,7 +4208,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -4533,7 +4231,7 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) al2 += hi; ah2 += lo; @@ -4556,7 +4254,7 @@ public: lo = __umul128(idx3, cl, &hi); - SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ) al3 += hi; ah3 += lo; @@ -4592,8 +4290,249 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) + { + // not supported + } + + // quadruple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + SET_ROUNDING_MODE_UP(); + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + VARIANT4_RANDOM_MATH_INIT(3, h3) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if 
(SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10); + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11); + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12); + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + VARIANT4_RANDOM_MATH(3, al3, ah3, cl, bx03, bx13); + + if (VARIANT == POW_V4) { + al3 ^= r3[2] | ((uint64_t)(r3[3]) << 32); + ah3 ^= r3[0] | ((uint64_t)(r3[1]) << 32); + } + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + 
cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) { // not supported } @@ -4804,14 +4743,6 @@ public: // not supported } - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - // not supported - } - inline static void hashHeavyTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4821,8 +4752,8 @@ public: } }; -template -class CryptoNightMultiHash +template +class CryptoNightMultiHash { public: inline static void hash(const uint8_t* __restrict__ input, @@ -5250,8 +5181,7 @@ public: size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) { // not supported } @@ -5362,11 +5292,11 @@ public: cx4 = _mm_aesenc_si128(cx4, ax4); } - SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) - SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) - SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) - SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ) _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); @@ -5388,7 +5318,7 @@ public: lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ) al0 += hi; ah0 += lo; @@ -5411,7 +5341,7 @@ public: lo = __umul128(idx1, cl, &hi); - SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ) al1 += hi; ah1 += lo; @@ -5434,7 +5364,7 @@ public: lo = __umul128(idx2, cl, &hi); - SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ) al2 += hi; ah2 += lo; @@ -5457,7 +5387,7 @@ public: lo = __umul128(idx3, cl, &hi); - SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ) al3 += hi; ah3 += lo; @@ -5480,7 +5410,7 @@ public: lo = __umul128(idx4, cl, &hi); - SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ) al4 += hi; ah4 += lo; @@ -5519,8 +5449,296 @@ public: size_t size, uint8_t* __restrict__ output, 
ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization, - PowVariant powVariant) + AsmOptimization asmOptimization) + { + // not supported + } + + // quintuple + inline static void hashPowV4(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + SET_ROUNDING_MODE_UP() + + VARIANT4_RANDOM_MATH_INIT(0, h0) + VARIANT4_RANDOM_MATH_INIT(1, h1) + VARIANT4_RANDOM_MATH_INIT(2, h2) + VARIANT4_RANDOM_MATH_INIT(3, h3) + VARIANT4_RANDOM_MATH_INIT(4, h4) + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], ax4); + } else { + cx0 = 
_mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1) + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2) + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3) + SHUFFLE_V4(l4, (idx4&MASK), bx04, bx14, ax4, cx4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + VARIANT4_RANDOM_MATH(0, al0, ah0, cl, bx00, bx10); + + if (VARIANT == POW_V4) { + al0 ^= r0[2] | ((uint64_t)(r0[3]) << 32); + ah0 ^= r0[0] | ((uint64_t)(r0[1]) << 32); + } + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_V4(l0, (idx0&MASK), bx00, bx10, ax0, cx0) + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + VARIANT4_RANDOM_MATH(1, al1, ah1, cl, bx01, bx11) + + if (VARIANT == POW_V4) { + al1 ^= r1[2] | ((uint64_t)(r1[3]) << 32); + ah1 ^= r1[0] | ((uint64_t)(r1[1]) << 32); + } + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_V4(l1, (idx1&MASK), bx01, bx11, ax1, cx1); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + VARIANT4_RANDOM_MATH(2, al2, ah2, cl, bx02, bx12); + + if (VARIANT == POW_V4) { + al2 ^= r2[2] | ((uint64_t)(r2[3]) << 32); + ah2 ^= r2[0] | ((uint64_t)(r2[1]) << 32); + } + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_V4(l2, (idx2&MASK), bx02, bx12, ax2, cx2); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + VARIANT4_RANDOM_MATH(3, al3, ah3, cl, bx03, bx13); + + if (VARIANT == POW_V4) { + al3 ^= r3[2] | ((uint64_t)(r3[3]) << 32); + ah3 ^= r3[0] | ((uint64_t)(r3[1]) << 32); + } + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_V4(l3, (idx3&MASK), bx03, bx13, ax3, cx3); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + + VARIANT4_RANDOM_MATH(4, al4, ah4, cl, bx04, bx14); + + if 
(VARIANT == POW_V4) { + al4 ^= r4[2] | ((uint64_t)(r4[3]) << 32); + ah4 ^= r4[0] | ((uint64_t)(r4[1]) << 32); + } + + lo = __umul128(idx4, cl, &hi); + + SHUFFLE_V4(l4, (idx4&MASK), bx04, bx14, ax4, cx4); + + al4 += hi; + ah4 += lo; + + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + + bx14 = bx04; + bx04 = cx4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + + inline static void hashPowV4_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + uint64_t height, + AsmOptimization asmOptimization) { // not supported } @@ -5773,14 +5991,6 @@ public: // not supported } - inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - // not supported - } - inline static void hashHeavyTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/asm/CryptonightR_soft_aes_template.inc b/src/crypto/asm/CryptonightR_soft_aes_template.inc new file mode 100644 index 00000000..40c7874d --- /dev/null +++ b/src/crypto/asm/CryptonightR_soft_aes_template.inc @@ -0,0 +1,279 @@ +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_part1): + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR 
[r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movq xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_mainloop): + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + pxor xmm6, xmm1 + pxor xmm6, xmm0 + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + movaps xmm0, xmm5 + psrldq xmm0, 8 + movd r9d, xmm0 + +FN_PREFIX(CryptonightR_soft_aes_template_part2): + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov edi, edi + shl rbp, 32 + or rbp, rdi + xor r8, rbp + + mov ebx, ebx + shl rsi, 32 + or rsi, rbx + xor QWORD PTR [rsp+320], rsi + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm1 + paddq xmm1, xmm7 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm6, xmm0 + paddq 
xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop) + +FN_PREFIX(CryptonightR_soft_aes_template_part3): + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +FN_PREFIX(CryptonightR_soft_aes_template_end): diff --git a/src/crypto/asm/CryptonightR_template.S b/src/crypto/asm/CryptonightR_template.S new file mode 100644 index 00000000..d2974d16 --- /dev/null +++ b/src/crypto/asm/CryptonightR_template.S @@ -0,0 +1,1595 @@ +#ifdef __APPLE__ +# define ALIGN(x) .align 6 +#else +# define ALIGN(x) .align 64 +#endif +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif + +#define PUBLIC .global + +PUBLIC FN_PREFIX(CryptonightR_instruction0) +PUBLIC FN_PREFIX(CryptonightR_instruction1) +PUBLIC FN_PREFIX(CryptonightR_instruction2) +PUBLIC FN_PREFIX(CryptonightR_instruction3) +PUBLIC FN_PREFIX(CryptonightR_instruction4) +PUBLIC FN_PREFIX(CryptonightR_instruction5) +PUBLIC FN_PREFIX(CryptonightR_instruction6) +PUBLIC FN_PREFIX(CryptonightR_instruction7) +PUBLIC FN_PREFIX(CryptonightR_instruction8) +PUBLIC FN_PREFIX(CryptonightR_instruction9) +PUBLIC FN_PREFIX(CryptonightR_instruction10) +PUBLIC FN_PREFIX(CryptonightR_instruction11) +PUBLIC FN_PREFIX(CryptonightR_instruction12) +PUBLIC FN_PREFIX(CryptonightR_instruction13) +PUBLIC FN_PREFIX(CryptonightR_instruction14) +PUBLIC FN_PREFIX(CryptonightR_instruction15) +PUBLIC FN_PREFIX(CryptonightR_instruction16) +PUBLIC FN_PREFIX(CryptonightR_instruction17) +PUBLIC FN_PREFIX(CryptonightR_instruction18) +PUBLIC FN_PREFIX(CryptonightR_instruction19) +PUBLIC FN_PREFIX(CryptonightR_instruction20) +PUBLIC FN_PREFIX(CryptonightR_instruction21) +PUBLIC FN_PREFIX(CryptonightR_instruction22) +PUBLIC FN_PREFIX(CryptonightR_instruction23) +PUBLIC FN_PREFIX(CryptonightR_instruction24) +PUBLIC FN_PREFIX(CryptonightR_instruction25) +PUBLIC FN_PREFIX(CryptonightR_instruction26) +PUBLIC FN_PREFIX(CryptonightR_instruction27) +PUBLIC FN_PREFIX(CryptonightR_instruction28) +PUBLIC FN_PREFIX(CryptonightR_instruction29) +PUBLIC FN_PREFIX(CryptonightR_instruction30) +PUBLIC FN_PREFIX(CryptonightR_instruction31) +PUBLIC FN_PREFIX(CryptonightR_instruction32) +PUBLIC FN_PREFIX(CryptonightR_instruction33) +PUBLIC FN_PREFIX(CryptonightR_instruction34) +PUBLIC FN_PREFIX(CryptonightR_instruction35) +PUBLIC FN_PREFIX(CryptonightR_instruction36) +PUBLIC FN_PREFIX(CryptonightR_instruction37) +PUBLIC FN_PREFIX(CryptonightR_instruction38) +PUBLIC FN_PREFIX(CryptonightR_instruction39) +PUBLIC FN_PREFIX(CryptonightR_instruction40) +PUBLIC FN_PREFIX(CryptonightR_instruction41) +PUBLIC FN_PREFIX(CryptonightR_instruction42) +PUBLIC FN_PREFIX(CryptonightR_instruction43) +PUBLIC FN_PREFIX(CryptonightR_instruction44) 
+PUBLIC FN_PREFIX(CryptonightR_instruction45) +PUBLIC FN_PREFIX(CryptonightR_instruction46) +PUBLIC FN_PREFIX(CryptonightR_instruction47) +PUBLIC FN_PREFIX(CryptonightR_instruction48) +PUBLIC FN_PREFIX(CryptonightR_instruction49) +PUBLIC FN_PREFIX(CryptonightR_instruction50) +PUBLIC FN_PREFIX(CryptonightR_instruction51) +PUBLIC FN_PREFIX(CryptonightR_instruction52) +PUBLIC FN_PREFIX(CryptonightR_instruction53) +PUBLIC FN_PREFIX(CryptonightR_instruction54) +PUBLIC FN_PREFIX(CryptonightR_instruction55) +PUBLIC FN_PREFIX(CryptonightR_instruction56) +PUBLIC FN_PREFIX(CryptonightR_instruction57) +PUBLIC FN_PREFIX(CryptonightR_instruction58) +PUBLIC FN_PREFIX(CryptonightR_instruction59) +PUBLIC FN_PREFIX(CryptonightR_instruction60) +PUBLIC FN_PREFIX(CryptonightR_instruction61) +PUBLIC FN_PREFIX(CryptonightR_instruction62) +PUBLIC FN_PREFIX(CryptonightR_instruction63) +PUBLIC FN_PREFIX(CryptonightR_instruction64) +PUBLIC FN_PREFIX(CryptonightR_instruction65) +PUBLIC FN_PREFIX(CryptonightR_instruction66) +PUBLIC FN_PREFIX(CryptonightR_instruction67) +PUBLIC FN_PREFIX(CryptonightR_instruction68) +PUBLIC FN_PREFIX(CryptonightR_instruction69) +PUBLIC FN_PREFIX(CryptonightR_instruction70) +PUBLIC FN_PREFIX(CryptonightR_instruction71) +PUBLIC FN_PREFIX(CryptonightR_instruction72) +PUBLIC FN_PREFIX(CryptonightR_instruction73) +PUBLIC FN_PREFIX(CryptonightR_instruction74) +PUBLIC FN_PREFIX(CryptonightR_instruction75) +PUBLIC FN_PREFIX(CryptonightR_instruction76) +PUBLIC FN_PREFIX(CryptonightR_instruction77) +PUBLIC FN_PREFIX(CryptonightR_instruction78) +PUBLIC FN_PREFIX(CryptonightR_instruction79) +PUBLIC FN_PREFIX(CryptonightR_instruction80) +PUBLIC FN_PREFIX(CryptonightR_instruction81) +PUBLIC FN_PREFIX(CryptonightR_instruction82) +PUBLIC FN_PREFIX(CryptonightR_instruction83) +PUBLIC FN_PREFIX(CryptonightR_instruction84) +PUBLIC FN_PREFIX(CryptonightR_instruction85) +PUBLIC FN_PREFIX(CryptonightR_instruction86) +PUBLIC FN_PREFIX(CryptonightR_instruction87) +PUBLIC FN_PREFIX(CryptonightR_instruction88) +PUBLIC FN_PREFIX(CryptonightR_instruction89) +PUBLIC FN_PREFIX(CryptonightR_instruction90) +PUBLIC FN_PREFIX(CryptonightR_instruction91) +PUBLIC FN_PREFIX(CryptonightR_instruction92) +PUBLIC FN_PREFIX(CryptonightR_instruction93) +PUBLIC FN_PREFIX(CryptonightR_instruction94) +PUBLIC FN_PREFIX(CryptonightR_instruction95) +PUBLIC FN_PREFIX(CryptonightR_instruction96) +PUBLIC FN_PREFIX(CryptonightR_instruction97) +PUBLIC FN_PREFIX(CryptonightR_instruction98) +PUBLIC FN_PREFIX(CryptonightR_instruction99) +PUBLIC FN_PREFIX(CryptonightR_instruction100) +PUBLIC FN_PREFIX(CryptonightR_instruction101) +PUBLIC FN_PREFIX(CryptonightR_instruction102) +PUBLIC FN_PREFIX(CryptonightR_instruction103) +PUBLIC FN_PREFIX(CryptonightR_instruction104) +PUBLIC FN_PREFIX(CryptonightR_instruction105) +PUBLIC FN_PREFIX(CryptonightR_instruction106) +PUBLIC FN_PREFIX(CryptonightR_instruction107) +PUBLIC FN_PREFIX(CryptonightR_instruction108) +PUBLIC FN_PREFIX(CryptonightR_instruction109) +PUBLIC FN_PREFIX(CryptonightR_instruction110) +PUBLIC FN_PREFIX(CryptonightR_instruction111) +PUBLIC FN_PREFIX(CryptonightR_instruction112) +PUBLIC FN_PREFIX(CryptonightR_instruction113) +PUBLIC FN_PREFIX(CryptonightR_instruction114) +PUBLIC FN_PREFIX(CryptonightR_instruction115) +PUBLIC FN_PREFIX(CryptonightR_instruction116) +PUBLIC FN_PREFIX(CryptonightR_instruction117) +PUBLIC FN_PREFIX(CryptonightR_instruction118) +PUBLIC FN_PREFIX(CryptonightR_instruction119) +PUBLIC FN_PREFIX(CryptonightR_instruction120) +PUBLIC 
FN_PREFIX(CryptonightR_instruction121) +PUBLIC FN_PREFIX(CryptonightR_instruction122) +PUBLIC FN_PREFIX(CryptonightR_instruction123) +PUBLIC FN_PREFIX(CryptonightR_instruction124) +PUBLIC FN_PREFIX(CryptonightR_instruction125) +PUBLIC FN_PREFIX(CryptonightR_instruction126) +PUBLIC FN_PREFIX(CryptonightR_instruction127) +PUBLIC FN_PREFIX(CryptonightR_instruction128) +PUBLIC FN_PREFIX(CryptonightR_instruction129) +PUBLIC FN_PREFIX(CryptonightR_instruction130) +PUBLIC FN_PREFIX(CryptonightR_instruction131) +PUBLIC FN_PREFIX(CryptonightR_instruction132) +PUBLIC FN_PREFIX(CryptonightR_instruction133) +PUBLIC FN_PREFIX(CryptonightR_instruction134) +PUBLIC FN_PREFIX(CryptonightR_instruction135) +PUBLIC FN_PREFIX(CryptonightR_instruction136) +PUBLIC FN_PREFIX(CryptonightR_instruction137) +PUBLIC FN_PREFIX(CryptonightR_instruction138) +PUBLIC FN_PREFIX(CryptonightR_instruction139) +PUBLIC FN_PREFIX(CryptonightR_instruction140) +PUBLIC FN_PREFIX(CryptonightR_instruction141) +PUBLIC FN_PREFIX(CryptonightR_instruction142) +PUBLIC FN_PREFIX(CryptonightR_instruction143) +PUBLIC FN_PREFIX(CryptonightR_instruction144) +PUBLIC FN_PREFIX(CryptonightR_instruction145) +PUBLIC FN_PREFIX(CryptonightR_instruction146) +PUBLIC FN_PREFIX(CryptonightR_instruction147) +PUBLIC FN_PREFIX(CryptonightR_instruction148) +PUBLIC FN_PREFIX(CryptonightR_instruction149) +PUBLIC FN_PREFIX(CryptonightR_instruction150) +PUBLIC FN_PREFIX(CryptonightR_instruction151) +PUBLIC FN_PREFIX(CryptonightR_instruction152) +PUBLIC FN_PREFIX(CryptonightR_instruction153) +PUBLIC FN_PREFIX(CryptonightR_instruction154) +PUBLIC FN_PREFIX(CryptonightR_instruction155) +PUBLIC FN_PREFIX(CryptonightR_instruction156) +PUBLIC FN_PREFIX(CryptonightR_instruction157) +PUBLIC FN_PREFIX(CryptonightR_instruction158) +PUBLIC FN_PREFIX(CryptonightR_instruction159) +PUBLIC FN_PREFIX(CryptonightR_instruction160) +PUBLIC FN_PREFIX(CryptonightR_instruction161) +PUBLIC FN_PREFIX(CryptonightR_instruction162) +PUBLIC FN_PREFIX(CryptonightR_instruction163) +PUBLIC FN_PREFIX(CryptonightR_instruction164) +PUBLIC FN_PREFIX(CryptonightR_instruction165) +PUBLIC FN_PREFIX(CryptonightR_instruction166) +PUBLIC FN_PREFIX(CryptonightR_instruction167) +PUBLIC FN_PREFIX(CryptonightR_instruction168) +PUBLIC FN_PREFIX(CryptonightR_instruction169) +PUBLIC FN_PREFIX(CryptonightR_instruction170) +PUBLIC FN_PREFIX(CryptonightR_instruction171) +PUBLIC FN_PREFIX(CryptonightR_instruction172) +PUBLIC FN_PREFIX(CryptonightR_instruction173) +PUBLIC FN_PREFIX(CryptonightR_instruction174) +PUBLIC FN_PREFIX(CryptonightR_instruction175) +PUBLIC FN_PREFIX(CryptonightR_instruction176) +PUBLIC FN_PREFIX(CryptonightR_instruction177) +PUBLIC FN_PREFIX(CryptonightR_instruction178) +PUBLIC FN_PREFIX(CryptonightR_instruction179) +PUBLIC FN_PREFIX(CryptonightR_instruction180) +PUBLIC FN_PREFIX(CryptonightR_instruction181) +PUBLIC FN_PREFIX(CryptonightR_instruction182) +PUBLIC FN_PREFIX(CryptonightR_instruction183) +PUBLIC FN_PREFIX(CryptonightR_instruction184) +PUBLIC FN_PREFIX(CryptonightR_instruction185) +PUBLIC FN_PREFIX(CryptonightR_instruction186) +PUBLIC FN_PREFIX(CryptonightR_instruction187) +PUBLIC FN_PREFIX(CryptonightR_instruction188) +PUBLIC FN_PREFIX(CryptonightR_instruction189) +PUBLIC FN_PREFIX(CryptonightR_instruction190) +PUBLIC FN_PREFIX(CryptonightR_instruction191) +PUBLIC FN_PREFIX(CryptonightR_instruction192) +PUBLIC FN_PREFIX(CryptonightR_instruction193) +PUBLIC FN_PREFIX(CryptonightR_instruction194) +PUBLIC FN_PREFIX(CryptonightR_instruction195) +PUBLIC 
FN_PREFIX(CryptonightR_instruction196) +PUBLIC FN_PREFIX(CryptonightR_instruction197) +PUBLIC FN_PREFIX(CryptonightR_instruction198) +PUBLIC FN_PREFIX(CryptonightR_instruction199) +PUBLIC FN_PREFIX(CryptonightR_instruction200) +PUBLIC FN_PREFIX(CryptonightR_instruction201) +PUBLIC FN_PREFIX(CryptonightR_instruction202) +PUBLIC FN_PREFIX(CryptonightR_instruction203) +PUBLIC FN_PREFIX(CryptonightR_instruction204) +PUBLIC FN_PREFIX(CryptonightR_instruction205) +PUBLIC FN_PREFIX(CryptonightR_instruction206) +PUBLIC FN_PREFIX(CryptonightR_instruction207) +PUBLIC FN_PREFIX(CryptonightR_instruction208) +PUBLIC FN_PREFIX(CryptonightR_instruction209) +PUBLIC FN_PREFIX(CryptonightR_instruction210) +PUBLIC FN_PREFIX(CryptonightR_instruction211) +PUBLIC FN_PREFIX(CryptonightR_instruction212) +PUBLIC FN_PREFIX(CryptonightR_instruction213) +PUBLIC FN_PREFIX(CryptonightR_instruction214) +PUBLIC FN_PREFIX(CryptonightR_instruction215) +PUBLIC FN_PREFIX(CryptonightR_instruction216) +PUBLIC FN_PREFIX(CryptonightR_instruction217) +PUBLIC FN_PREFIX(CryptonightR_instruction218) +PUBLIC FN_PREFIX(CryptonightR_instruction219) +PUBLIC FN_PREFIX(CryptonightR_instruction220) +PUBLIC FN_PREFIX(CryptonightR_instruction221) +PUBLIC FN_PREFIX(CryptonightR_instruction222) +PUBLIC FN_PREFIX(CryptonightR_instruction223) +PUBLIC FN_PREFIX(CryptonightR_instruction224) +PUBLIC FN_PREFIX(CryptonightR_instruction225) +PUBLIC FN_PREFIX(CryptonightR_instruction226) +PUBLIC FN_PREFIX(CryptonightR_instruction227) +PUBLIC FN_PREFIX(CryptonightR_instruction228) +PUBLIC FN_PREFIX(CryptonightR_instruction229) +PUBLIC FN_PREFIX(CryptonightR_instruction230) +PUBLIC FN_PREFIX(CryptonightR_instruction231) +PUBLIC FN_PREFIX(CryptonightR_instruction232) +PUBLIC FN_PREFIX(CryptonightR_instruction233) +PUBLIC FN_PREFIX(CryptonightR_instruction234) +PUBLIC FN_PREFIX(CryptonightR_instruction235) +PUBLIC FN_PREFIX(CryptonightR_instruction236) +PUBLIC FN_PREFIX(CryptonightR_instruction237) +PUBLIC FN_PREFIX(CryptonightR_instruction238) +PUBLIC FN_PREFIX(CryptonightR_instruction239) +PUBLIC FN_PREFIX(CryptonightR_instruction240) +PUBLIC FN_PREFIX(CryptonightR_instruction241) +PUBLIC FN_PREFIX(CryptonightR_instruction242) +PUBLIC FN_PREFIX(CryptonightR_instruction243) +PUBLIC FN_PREFIX(CryptonightR_instruction244) +PUBLIC FN_PREFIX(CryptonightR_instruction245) +PUBLIC FN_PREFIX(CryptonightR_instruction246) +PUBLIC FN_PREFIX(CryptonightR_instruction247) +PUBLIC FN_PREFIX(CryptonightR_instruction248) +PUBLIC FN_PREFIX(CryptonightR_instruction249) +PUBLIC FN_PREFIX(CryptonightR_instruction250) +PUBLIC FN_PREFIX(CryptonightR_instruction251) +PUBLIC FN_PREFIX(CryptonightR_instruction252) +PUBLIC FN_PREFIX(CryptonightR_instruction253) +PUBLIC FN_PREFIX(CryptonightR_instruction254) +PUBLIC FN_PREFIX(CryptonightR_instruction255) +PUBLIC FN_PREFIX(CryptonightR_instruction256) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov0) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov1) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov2) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov3) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov4) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov5) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov6) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov7) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov8) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov9) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov10) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov11) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov12) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov13) 
+PUBLIC FN_PREFIX(CryptonightR_instruction_mov14) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov15) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov16) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov17) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov18) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov19) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov20) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov21) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov22) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov23) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov24) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov25) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov26) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov27) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov28) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov29) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov30) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov31) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov32) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov33) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov34) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov35) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov36) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov37) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov38) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov39) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov40) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov41) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov42) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov43) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov44) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov45) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov46) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov47) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov48) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov49) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov50) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov51) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov52) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov53) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov54) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov55) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov56) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov57) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov58) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov59) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov60) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov61) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov62) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov63) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov64) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov65) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov66) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov67) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov68) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov69) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov70) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov71) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov72) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov73) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov74) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov75) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov76) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov77) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov78) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov79) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov80) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov81) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov82) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov83) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov84) 
+PUBLIC FN_PREFIX(CryptonightR_instruction_mov85) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov86) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov87) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov88) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov89) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov90) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov91) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov92) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov93) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov94) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov95) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov96) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov97) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov98) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov99) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov100) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov101) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov102) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov103) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov104) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov105) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov106) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov107) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov108) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov109) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov110) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov111) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov112) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov113) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov114) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov115) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov116) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov117) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov118) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov119) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov120) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov121) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov122) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov123) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov124) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov125) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov126) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov127) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov128) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov129) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov130) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov131) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov132) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov133) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov134) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov135) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov136) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov137) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov138) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov139) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov140) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov141) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov142) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov143) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov144) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov145) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov146) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov147) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov148) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov149) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov150) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov151) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov152) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov153) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov154) 
+PUBLIC FN_PREFIX(CryptonightR_instruction_mov155) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov156) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov157) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov158) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov159) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov160) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov161) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov162) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov163) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov164) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov165) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov166) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov167) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov168) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov169) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov170) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov171) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov172) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov173) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov174) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov175) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov176) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov177) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov178) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov179) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov180) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov181) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov182) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov183) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov184) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov185) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov186) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov187) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov188) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov189) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov190) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov191) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov192) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov193) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov194) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov195) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov196) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov197) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov198) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov199) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov200) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov201) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov202) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov203) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov204) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov205) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov206) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov207) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov208) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov209) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov210) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov211) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov212) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov213) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov214) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov215) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov216) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov217) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov218) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov219) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov220) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov221) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov222) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov223) +PUBLIC 
FN_PREFIX(CryptonightR_instruction_mov224) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov225) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov226) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov227) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov228) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov229) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov230) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov231) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov232) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov233) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov234) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov235) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov236) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov237) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov238) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov239) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov240) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov241) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov242) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov243) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov244) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov245) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov246) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov247) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov248) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov249) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov250) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov251) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov252) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov253) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov254) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov255) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov256) + +#include "CryptonightWOW_template.inc" +#include "CryptonightR_template.inc" +#include "CryptonightWOW_soft_aes_template.inc" +#include "CryptonightR_soft_aes_template.inc" + +FN_PREFIX(CryptonightR_instruction0): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction1): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction2): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction3): + add rbx, r9 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction4): + sub rbx, r9 +FN_PREFIX(CryptonightR_instruction5): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction6): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction7): + xor rbx, r9 +FN_PREFIX(CryptonightR_instruction8): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction9): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction10): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction11): + add rsi, rbx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction12): + sub rsi, rbx +FN_PREFIX(CryptonightR_instruction13): + ror esi, cl +FN_PREFIX(CryptonightR_instruction14): + rol esi, cl +FN_PREFIX(CryptonightR_instruction15): + xor rsi, rbx +FN_PREFIX(CryptonightR_instruction16): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction17): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction18): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction19): + add rdi, rbx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction20): + sub rdi, rbx +FN_PREFIX(CryptonightR_instruction21): + ror edi, cl +FN_PREFIX(CryptonightR_instruction22): + rol edi, cl +FN_PREFIX(CryptonightR_instruction23): + xor rdi, rbx +FN_PREFIX(CryptonightR_instruction24): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction25): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction26): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction27): + add rbp, rbx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction28): + sub rbp, rbx 
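None of these labeled stubs is meant to be called directly. They form a catalog of position-independent machine-code snippets, one per possible random-math instruction, that a runtime code generator (presumably the CryptoNightR_gen.cpp added elsewhere in this patch) splices into a program-specific main loop; the portable hashPowV4 loops above do the same work in C++ via VARIANT4_RANDOM_MATH, including the "VARIANT == POW_V4" xor of the r registers into al/ah that CN-R requires and that the Wownero variant skips. FN_PREFIX supplies the leading underscore that Mach-O symbol names need, and ALIGN maps to the exponent form of .align that Apple's assembler expects. A minimal sketch of the splice step, assuming only that consecutive labels bound one snippet, as they do in this template:

    #include <cstdint>
    #include <cstring>

    extern "C" {
        void CryptonightR_instruction0();   // declared PUBLIC above
        void CryptonightR_instruction1();
    }

    // Copy one snippet's machine code into an executable buffer. The byte
    // range is bounded by two consecutive labels; the snippets use no
    // RIP-relative addressing, so the copied bytes stay valid at any address.
    static size_t emit_snippet(uint8_t* out, const void* begin, const void* end)
    {
        const size_t len = static_cast<const uint8_t*>(end)
                         - static_cast<const uint8_t*>(begin);
        memcpy(out, begin, len);
        return len;
    }

    // Usage: emit_snippet(p, (void*) &CryptonightR_instruction0,
    //                        (void*) &CryptonightR_instruction1);

Treating label pairs as byte ranges keeps the generator free of any instruction decoder: the assembler has already encoded every legal operation once.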
+FN_PREFIX(CryptonightR_instruction29): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction30): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction31): + xor rbp, rbx +FN_PREFIX(CryptonightR_instruction32): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction33): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction34): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction35): + add rbx, rsi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction36): + sub rbx, rsi +FN_PREFIX(CryptonightR_instruction37): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction38): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction39): + xor rbx, rsi +FN_PREFIX(CryptonightR_instruction40): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction41): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction42): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction43): + add rsi, r9 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction44): + sub rsi, r9 +FN_PREFIX(CryptonightR_instruction45): + ror esi, cl +FN_PREFIX(CryptonightR_instruction46): + rol esi, cl +FN_PREFIX(CryptonightR_instruction47): + xor rsi, r9 +FN_PREFIX(CryptonightR_instruction48): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction49): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction50): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction51): + add rdi, rsi + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction52): + sub rdi, rsi +FN_PREFIX(CryptonightR_instruction53): + ror edi, cl +FN_PREFIX(CryptonightR_instruction54): + rol edi, cl +FN_PREFIX(CryptonightR_instruction55): + xor rdi, rsi +FN_PREFIX(CryptonightR_instruction56): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction57): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction58): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction59): + add rbp, rsi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction60): + sub rbp, rsi +FN_PREFIX(CryptonightR_instruction61): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction62): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction63): + xor rbp, rsi +FN_PREFIX(CryptonightR_instruction64): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction65): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction66): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction67): + add rbx, rdi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction68): + sub rbx, rdi +FN_PREFIX(CryptonightR_instruction69): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction70): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction71): + xor rbx, rdi +FN_PREFIX(CryptonightR_instruction72): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction73): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction74): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction75): + add rsi, rdi + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction76): + sub rsi, rdi +FN_PREFIX(CryptonightR_instruction77): + ror esi, cl +FN_PREFIX(CryptonightR_instruction78): + rol esi, cl +FN_PREFIX(CryptonightR_instruction79): + xor rsi, rdi +FN_PREFIX(CryptonightR_instruction80): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction81): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction82): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction83): + add rdi, r9 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction84): + sub rdi, r9 +FN_PREFIX(CryptonightR_instruction85): + ror edi, cl +FN_PREFIX(CryptonightR_instruction86): + rol edi, cl +FN_PREFIX(CryptonightR_instruction87): + xor rdi, r9 +FN_PREFIX(CryptonightR_instruction88): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction89): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction90): + 
imul rbp, rdi +FN_PREFIX(CryptonightR_instruction91): + add rbp, rdi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction92): + sub rbp, rdi +FN_PREFIX(CryptonightR_instruction93): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction94): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction95): + xor rbp, rdi +FN_PREFIX(CryptonightR_instruction96): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction97): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction98): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction99): + add rbx, rbp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction100): + sub rbx, rbp +FN_PREFIX(CryptonightR_instruction101): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction102): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction103): + xor rbx, rbp +FN_PREFIX(CryptonightR_instruction104): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction105): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction106): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction107): + add rsi, rbp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction108): + sub rsi, rbp +FN_PREFIX(CryptonightR_instruction109): + ror esi, cl +FN_PREFIX(CryptonightR_instruction110): + rol esi, cl +FN_PREFIX(CryptonightR_instruction111): + xor rsi, rbp +FN_PREFIX(CryptonightR_instruction112): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction113): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction114): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction115): + add rdi, rbp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction116): + sub rdi, rbp +FN_PREFIX(CryptonightR_instruction117): + ror edi, cl +FN_PREFIX(CryptonightR_instruction118): + rol edi, cl +FN_PREFIX(CryptonightR_instruction119): + xor rdi, rbp +FN_PREFIX(CryptonightR_instruction120): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction121): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction122): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction123): + add rbp, r9 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction124): + sub rbp, r9 +FN_PREFIX(CryptonightR_instruction125): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction126): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction127): + xor rbp, r9 +FN_PREFIX(CryptonightR_instruction128): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction129): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction130): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction131): + add rbx, rsp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction132): + sub rbx, rsp +FN_PREFIX(CryptonightR_instruction133): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction134): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction135): + xor rbx, rsp +FN_PREFIX(CryptonightR_instruction136): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction137): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction138): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction139): + add rsi, rsp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction140): + sub rsi, rsp +FN_PREFIX(CryptonightR_instruction141): + ror esi, cl +FN_PREFIX(CryptonightR_instruction142): + rol esi, cl +FN_PREFIX(CryptonightR_instruction143): + xor rsi, rsp +FN_PREFIX(CryptonightR_instruction144): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction145): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction146): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction147): + add rdi, rsp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction148): + sub rdi, rsp +FN_PREFIX(CryptonightR_instruction149): + ror edi, cl +FN_PREFIX(CryptonightR_instruction150): + rol edi, cl 
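The table's layout is regular: each of the eight source registers (rbx, rsi, rdi, rbp, rsp, r15, rax, rdx) owns a 32-entry block, subdivided into four destinations (rbx, rsi, rdi, rbp) of eight opcode slots. Slots 0 through 2 all hold MUL, so a uniformly drawn 3-bit opcode lands on multiplication three times in eight, matching CN-R's instruction weighting; slots 3 through 7 are ADD, SUB, ROR, ROL and XOR. On the diagonal, where source and destination coincide, the ADD, SUB and XOR entries substitute r9 so the degenerate reg-op-reg form never appears. The index arithmetic this implies, inferred from the listing (the helper name is illustrative):

    // dst: 0=rbx 1=rsi 2=rdi 3=rbp
    // src: 0=rbx 1=rsi 2=rdi 3=rbp 4=rsp 5=r15 6=rax 7=rdx
    // slot: 0..2=MUL 3=ADD 4=SUB 5=ROR 6=ROL 7=XOR
    static inline int snippet_index(int slot, int dst, int src)
    {
        return src * 32 + dst * 8 + slot;   // 8 * 4 * 8 = 256 entries
    }
    // e.g. snippet_index(3, 0, 1) == 35 -> "add rbx, rsi" above.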
+FN_PREFIX(CryptonightR_instruction151): + xor rdi, rsp +FN_PREFIX(CryptonightR_instruction152): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction153): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction154): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction155): + add rbp, rsp + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction156): + sub rbp, rsp +FN_PREFIX(CryptonightR_instruction157): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction158): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction159): + xor rbp, rsp +FN_PREFIX(CryptonightR_instruction160): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction161): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction162): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction163): + add rbx, r15 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction164): + sub rbx, r15 +FN_PREFIX(CryptonightR_instruction165): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction166): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction167): + xor rbx, r15 +FN_PREFIX(CryptonightR_instruction168): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction169): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction170): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction171): + add rsi, r15 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction172): + sub rsi, r15 +FN_PREFIX(CryptonightR_instruction173): + ror esi, cl +FN_PREFIX(CryptonightR_instruction174): + rol esi, cl +FN_PREFIX(CryptonightR_instruction175): + xor rsi, r15 +FN_PREFIX(CryptonightR_instruction176): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction177): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction178): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction179): + add rdi, r15 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction180): + sub rdi, r15 +FN_PREFIX(CryptonightR_instruction181): + ror edi, cl +FN_PREFIX(CryptonightR_instruction182): + rol edi, cl +FN_PREFIX(CryptonightR_instruction183): + xor rdi, r15 +FN_PREFIX(CryptonightR_instruction184): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction185): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction186): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction187): + add rbp, r15 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction188): + sub rbp, r15 +FN_PREFIX(CryptonightR_instruction189): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction190): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction191): + xor rbp, r15 +FN_PREFIX(CryptonightR_instruction192): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction193): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction194): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction195): + add rbx, rax + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction196): + sub rbx, rax +FN_PREFIX(CryptonightR_instruction197): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction198): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction199): + xor rbx, rax +FN_PREFIX(CryptonightR_instruction200): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction201): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction202): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction203): + add rsi, rax + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction204): + sub rsi, rax +FN_PREFIX(CryptonightR_instruction205): + ror esi, cl +FN_PREFIX(CryptonightR_instruction206): + rol esi, cl +FN_PREFIX(CryptonightR_instruction207): + xor rsi, rax +FN_PREFIX(CryptonightR_instruction208): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction209): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction210): + imul rdi, rax 
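Every ADD slot carries a second instruction, "add reg, 2147483647". CN-R's ADD mixes in a random 32-bit constant chosen per generated program, so 0x7FFFFFFF is only a recognizable placeholder; after copying the snippet, the generator is expected to overwrite the immediate. A sketch of that patch step (assumption: the imm32 occupies the final four bytes of the copied snippet, which holds for the "add r64, imm32" encoding used here):

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    // Overwrite the placeholder constant in a copied ADD snippet with the
    // program-specific value.
    static void patch_add_constant(uint8_t* snippet, size_t len, uint32_t c)
    {
        memcpy(snippet + len - sizeof(uint32_t), &c, sizeof(uint32_t));
    }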
+FN_PREFIX(CryptonightR_instruction211): + add rdi, rax + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction212): + sub rdi, rax +FN_PREFIX(CryptonightR_instruction213): + ror edi, cl +FN_PREFIX(CryptonightR_instruction214): + rol edi, cl +FN_PREFIX(CryptonightR_instruction215): + xor rdi, rax +FN_PREFIX(CryptonightR_instruction216): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction217): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction218): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction219): + add rbp, rax + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction220): + sub rbp, rax +FN_PREFIX(CryptonightR_instruction221): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction222): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction223): + xor rbp, rax +FN_PREFIX(CryptonightR_instruction224): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction225): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction226): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction227): + add rbx, rdx + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction228): + sub rbx, rdx +FN_PREFIX(CryptonightR_instruction229): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction230): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction231): + xor rbx, rdx +FN_PREFIX(CryptonightR_instruction232): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction233): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction234): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction235): + add rsi, rdx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction236): + sub rsi, rdx +FN_PREFIX(CryptonightR_instruction237): + ror esi, cl +FN_PREFIX(CryptonightR_instruction238): + rol esi, cl +FN_PREFIX(CryptonightR_instruction239): + xor rsi, rdx +FN_PREFIX(CryptonightR_instruction240): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction241): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction242): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction243): + add rdi, rdx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction244): + sub rdi, rdx +FN_PREFIX(CryptonightR_instruction245): + ror edi, cl +FN_PREFIX(CryptonightR_instruction246): + rol edi, cl +FN_PREFIX(CryptonightR_instruction247): + xor rdi, rdx +FN_PREFIX(CryptonightR_instruction248): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction249): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction250): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction251): + add rbp, rdx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction252): + sub rbp, rdx +FN_PREFIX(CryptonightR_instruction253): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction254): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction255): + xor rbp, rdx +FN_PREFIX(CryptonightR_instruction256): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction_mov0): + +FN_PREFIX(CryptonightR_instruction_mov1): + +FN_PREFIX(CryptonightR_instruction_mov2): + +FN_PREFIX(CryptonightR_instruction_mov3): + +FN_PREFIX(CryptonightR_instruction_mov4): + +FN_PREFIX(CryptonightR_instruction_mov5): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov6): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov7): + +FN_PREFIX(CryptonightR_instruction_mov8): + +FN_PREFIX(CryptonightR_instruction_mov9): + +FN_PREFIX(CryptonightR_instruction_mov10): + +FN_PREFIX(CryptonightR_instruction_mov11): + +FN_PREFIX(CryptonightR_instruction_mov12): + +FN_PREFIX(CryptonightR_instruction_mov13): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov14): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov15): + +FN_PREFIX(CryptonightR_instruction_mov16): + 
+FN_PREFIX(CryptonightR_instruction_mov17): + +FN_PREFIX(CryptonightR_instruction_mov18): + +FN_PREFIX(CryptonightR_instruction_mov19): + +FN_PREFIX(CryptonightR_instruction_mov20): + +FN_PREFIX(CryptonightR_instruction_mov21): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov22): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov23): + +FN_PREFIX(CryptonightR_instruction_mov24): + +FN_PREFIX(CryptonightR_instruction_mov25): + +FN_PREFIX(CryptonightR_instruction_mov26): + +FN_PREFIX(CryptonightR_instruction_mov27): + +FN_PREFIX(CryptonightR_instruction_mov28): + +FN_PREFIX(CryptonightR_instruction_mov29): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov30): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov31): + +FN_PREFIX(CryptonightR_instruction_mov32): + +FN_PREFIX(CryptonightR_instruction_mov33): + +FN_PREFIX(CryptonightR_instruction_mov34): + +FN_PREFIX(CryptonightR_instruction_mov35): + +FN_PREFIX(CryptonightR_instruction_mov36): + +FN_PREFIX(CryptonightR_instruction_mov37): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov38): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov39): + +FN_PREFIX(CryptonightR_instruction_mov40): + +FN_PREFIX(CryptonightR_instruction_mov41): + +FN_PREFIX(CryptonightR_instruction_mov42): + +FN_PREFIX(CryptonightR_instruction_mov43): + +FN_PREFIX(CryptonightR_instruction_mov44): + +FN_PREFIX(CryptonightR_instruction_mov45): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov46): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov47): + +FN_PREFIX(CryptonightR_instruction_mov48): + +FN_PREFIX(CryptonightR_instruction_mov49): + +FN_PREFIX(CryptonightR_instruction_mov50): + +FN_PREFIX(CryptonightR_instruction_mov51): + +FN_PREFIX(CryptonightR_instruction_mov52): + +FN_PREFIX(CryptonightR_instruction_mov53): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov54): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov55): + +FN_PREFIX(CryptonightR_instruction_mov56): + +FN_PREFIX(CryptonightR_instruction_mov57): + +FN_PREFIX(CryptonightR_instruction_mov58): + +FN_PREFIX(CryptonightR_instruction_mov59): + +FN_PREFIX(CryptonightR_instruction_mov60): + +FN_PREFIX(CryptonightR_instruction_mov61): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov62): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov63): + +FN_PREFIX(CryptonightR_instruction_mov64): + +FN_PREFIX(CryptonightR_instruction_mov65): + +FN_PREFIX(CryptonightR_instruction_mov66): + +FN_PREFIX(CryptonightR_instruction_mov67): + +FN_PREFIX(CryptonightR_instruction_mov68): + +FN_PREFIX(CryptonightR_instruction_mov69): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov70): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov71): + +FN_PREFIX(CryptonightR_instruction_mov72): + +FN_PREFIX(CryptonightR_instruction_mov73): + +FN_PREFIX(CryptonightR_instruction_mov74): + +FN_PREFIX(CryptonightR_instruction_mov75): + +FN_PREFIX(CryptonightR_instruction_mov76): + +FN_PREFIX(CryptonightR_instruction_mov77): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov78): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov79): + +FN_PREFIX(CryptonightR_instruction_mov80): + +FN_PREFIX(CryptonightR_instruction_mov81): + +FN_PREFIX(CryptonightR_instruction_mov82): + +FN_PREFIX(CryptonightR_instruction_mov83): + +FN_PREFIX(CryptonightR_instruction_mov84): + +FN_PREFIX(CryptonightR_instruction_mov85): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov86): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov87): + +FN_PREFIX(CryptonightR_instruction_mov88): + 
+FN_PREFIX(CryptonightR_instruction_mov89): + +FN_PREFIX(CryptonightR_instruction_mov90): + +FN_PREFIX(CryptonightR_instruction_mov91): + +FN_PREFIX(CryptonightR_instruction_mov92): + +FN_PREFIX(CryptonightR_instruction_mov93): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov94): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov95): + +FN_PREFIX(CryptonightR_instruction_mov96): + +FN_PREFIX(CryptonightR_instruction_mov97): + +FN_PREFIX(CryptonightR_instruction_mov98): + +FN_PREFIX(CryptonightR_instruction_mov99): + +FN_PREFIX(CryptonightR_instruction_mov100): + +FN_PREFIX(CryptonightR_instruction_mov101): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov102): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov103): + +FN_PREFIX(CryptonightR_instruction_mov104): + +FN_PREFIX(CryptonightR_instruction_mov105): + +FN_PREFIX(CryptonightR_instruction_mov106): + +FN_PREFIX(CryptonightR_instruction_mov107): + +FN_PREFIX(CryptonightR_instruction_mov108): + +FN_PREFIX(CryptonightR_instruction_mov109): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov110): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov111): + +FN_PREFIX(CryptonightR_instruction_mov112): + +FN_PREFIX(CryptonightR_instruction_mov113): + +FN_PREFIX(CryptonightR_instruction_mov114): + +FN_PREFIX(CryptonightR_instruction_mov115): + +FN_PREFIX(CryptonightR_instruction_mov116): + +FN_PREFIX(CryptonightR_instruction_mov117): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov118): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov119): + +FN_PREFIX(CryptonightR_instruction_mov120): + +FN_PREFIX(CryptonightR_instruction_mov121): + +FN_PREFIX(CryptonightR_instruction_mov122): + +FN_PREFIX(CryptonightR_instruction_mov123): + +FN_PREFIX(CryptonightR_instruction_mov124): + +FN_PREFIX(CryptonightR_instruction_mov125): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov126): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov127): + +FN_PREFIX(CryptonightR_instruction_mov128): + +FN_PREFIX(CryptonightR_instruction_mov129): + +FN_PREFIX(CryptonightR_instruction_mov130): + +FN_PREFIX(CryptonightR_instruction_mov131): + +FN_PREFIX(CryptonightR_instruction_mov132): + +FN_PREFIX(CryptonightR_instruction_mov133): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov134): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov135): + +FN_PREFIX(CryptonightR_instruction_mov136): + +FN_PREFIX(CryptonightR_instruction_mov137): + +FN_PREFIX(CryptonightR_instruction_mov138): + +FN_PREFIX(CryptonightR_instruction_mov139): + +FN_PREFIX(CryptonightR_instruction_mov140): + +FN_PREFIX(CryptonightR_instruction_mov141): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov142): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov143): + +FN_PREFIX(CryptonightR_instruction_mov144): + +FN_PREFIX(CryptonightR_instruction_mov145): + +FN_PREFIX(CryptonightR_instruction_mov146): + +FN_PREFIX(CryptonightR_instruction_mov147): + +FN_PREFIX(CryptonightR_instruction_mov148): + +FN_PREFIX(CryptonightR_instruction_mov149): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov150): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov151): + +FN_PREFIX(CryptonightR_instruction_mov152): + +FN_PREFIX(CryptonightR_instruction_mov153): + +FN_PREFIX(CryptonightR_instruction_mov154): + +FN_PREFIX(CryptonightR_instruction_mov155): + +FN_PREFIX(CryptonightR_instruction_mov156): + +FN_PREFIX(CryptonightR_instruction_mov157): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov158): + mov rcx, rsp 
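// [Annotation, not part of the generated template] The _mov stubs are
// one-instruction prefixes for the table above, and most of them are
// deliberately empty. Only indices congruent to 5 or 6 modulo 8 -- the ROR
// and ROL slots -- contain "mov rcx, <src>", because ror/rol take their
// rotate count from cl. The source register cycles every 32 entries, in step
// with the src-major layout of the main table.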
+FN_PREFIX(CryptonightR_instruction_mov159): + +FN_PREFIX(CryptonightR_instruction_mov160): + +FN_PREFIX(CryptonightR_instruction_mov161): + +FN_PREFIX(CryptonightR_instruction_mov162): + +FN_PREFIX(CryptonightR_instruction_mov163): + +FN_PREFIX(CryptonightR_instruction_mov164): + +FN_PREFIX(CryptonightR_instruction_mov165): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov166): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov167): + +FN_PREFIX(CryptonightR_instruction_mov168): + +FN_PREFIX(CryptonightR_instruction_mov169): + +FN_PREFIX(CryptonightR_instruction_mov170): + +FN_PREFIX(CryptonightR_instruction_mov171): + +FN_PREFIX(CryptonightR_instruction_mov172): + +FN_PREFIX(CryptonightR_instruction_mov173): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov174): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov175): + +FN_PREFIX(CryptonightR_instruction_mov176): + +FN_PREFIX(CryptonightR_instruction_mov177): + +FN_PREFIX(CryptonightR_instruction_mov178): + +FN_PREFIX(CryptonightR_instruction_mov179): + +FN_PREFIX(CryptonightR_instruction_mov180): + +FN_PREFIX(CryptonightR_instruction_mov181): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov182): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov183): + +FN_PREFIX(CryptonightR_instruction_mov184): + +FN_PREFIX(CryptonightR_instruction_mov185): + +FN_PREFIX(CryptonightR_instruction_mov186): + +FN_PREFIX(CryptonightR_instruction_mov187): + +FN_PREFIX(CryptonightR_instruction_mov188): + +FN_PREFIX(CryptonightR_instruction_mov189): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov190): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov191): + +FN_PREFIX(CryptonightR_instruction_mov192): + +FN_PREFIX(CryptonightR_instruction_mov193): + +FN_PREFIX(CryptonightR_instruction_mov194): + +FN_PREFIX(CryptonightR_instruction_mov195): + +FN_PREFIX(CryptonightR_instruction_mov196): + +FN_PREFIX(CryptonightR_instruction_mov197): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov198): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov199): + +FN_PREFIX(CryptonightR_instruction_mov200): + +FN_PREFIX(CryptonightR_instruction_mov201): + +FN_PREFIX(CryptonightR_instruction_mov202): + +FN_PREFIX(CryptonightR_instruction_mov203): + +FN_PREFIX(CryptonightR_instruction_mov204): + +FN_PREFIX(CryptonightR_instruction_mov205): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov206): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov207): + +FN_PREFIX(CryptonightR_instruction_mov208): + +FN_PREFIX(CryptonightR_instruction_mov209): + +FN_PREFIX(CryptonightR_instruction_mov210): + +FN_PREFIX(CryptonightR_instruction_mov211): + +FN_PREFIX(CryptonightR_instruction_mov212): + +FN_PREFIX(CryptonightR_instruction_mov213): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov214): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov215): + +FN_PREFIX(CryptonightR_instruction_mov216): + +FN_PREFIX(CryptonightR_instruction_mov217): + +FN_PREFIX(CryptonightR_instruction_mov218): + +FN_PREFIX(CryptonightR_instruction_mov219): + +FN_PREFIX(CryptonightR_instruction_mov220): + +FN_PREFIX(CryptonightR_instruction_mov221): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov222): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov223): + +FN_PREFIX(CryptonightR_instruction_mov224): + +FN_PREFIX(CryptonightR_instruction_mov225): + +FN_PREFIX(CryptonightR_instruction_mov226): + +FN_PREFIX(CryptonightR_instruction_mov227): + +FN_PREFIX(CryptonightR_instruction_mov228): + +FN_PREFIX(CryptonightR_instruction_mov229): + mov rcx, 
rdx +FN_PREFIX(CryptonightR_instruction_mov230): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov231): + +FN_PREFIX(CryptonightR_instruction_mov232): + +FN_PREFIX(CryptonightR_instruction_mov233): + +FN_PREFIX(CryptonightR_instruction_mov234): + +FN_PREFIX(CryptonightR_instruction_mov235): + +FN_PREFIX(CryptonightR_instruction_mov236): + +FN_PREFIX(CryptonightR_instruction_mov237): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov238): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov239): + +FN_PREFIX(CryptonightR_instruction_mov240): + +FN_PREFIX(CryptonightR_instruction_mov241): + +FN_PREFIX(CryptonightR_instruction_mov242): + +FN_PREFIX(CryptonightR_instruction_mov243): + +FN_PREFIX(CryptonightR_instruction_mov244): + +FN_PREFIX(CryptonightR_instruction_mov245): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov246): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov247): + +FN_PREFIX(CryptonightR_instruction_mov248): + +FN_PREFIX(CryptonightR_instruction_mov249): + +FN_PREFIX(CryptonightR_instruction_mov250): + +FN_PREFIX(CryptonightR_instruction_mov251): + +FN_PREFIX(CryptonightR_instruction_mov252): + +FN_PREFIX(CryptonightR_instruction_mov253): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov254): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov255): + +FN_PREFIX(CryptonightR_instruction_mov256): diff --git a/src/crypto/asm/CryptonightR_template.h b/src/crypto/asm/CryptonightR_template.h new file mode 100644 index 00000000..d9159a8f --- /dev/null +++ b/src/crypto/asm/CryptonightR_template.h @@ -0,0 +1,1087 @@ +// Auto-generated file, do not edit + +extern "C" +{ + void CryptonightWOW_template_part1(); + void CryptonightWOW_template_mainloop(); + void CryptonightWOW_template_part2(); + void CryptonightWOW_template_part3(); + void CryptonightWOW_template_end(); + void CryptonightWOW_template_double_part1(); + void CryptonightWOW_template_double_mainloop(); + void CryptonightWOW_template_double_part2(); + void CryptonightWOW_template_double_part3(); + void CryptonightWOW_template_double_part4(); + void CryptonightWOW_template_double_end(); + + void CryptonightR_template_part1(); + void CryptonightR_template_mainloop(); + void CryptonightR_template_part2(); + void CryptonightR_template_part3(); + void CryptonightR_template_end(); + void CryptonightR_template_double_part1(); + void CryptonightR_template_double_mainloop(); + void CryptonightR_template_double_part2(); + void CryptonightR_template_double_part3(); + void CryptonightR_template_double_part4(); + void CryptonightR_template_double_end(); + + void CryptonightWOW_soft_aes_template_part1(); + void CryptonightWOW_soft_aes_template_mainloop(); + void CryptonightWOW_soft_aes_template_part2(); + void CryptonightWOW_soft_aes_template_part3(); + void CryptonightWOW_soft_aes_template_end(); + void CryptonightWOW_soft_aes_template_double_part1(); + void CryptonightWOW_soft_aes_template_double_mainloop(); + void CryptonightWOW_soft_aes_template_double_part2(); + void CryptonightWOW_soft_aes_template_double_part3(); + void CryptonightWOW_soft_aes_template_double_part4(); + void CryptonightWOW_soft_aes_template_double_end(); + + void CryptonightR_soft_aes_template_part1(); + void CryptonightR_soft_aes_template_mainloop(); + void CryptonightR_soft_aes_template_part2(); + void CryptonightR_soft_aes_template_part3(); + void CryptonightR_soft_aes_template_end(); + void CryptonightR_soft_aes_template_double_part1(); + void CryptonightR_soft_aes_template_double_mainloop(); + void 
CryptonightR_soft_aes_template_double_part2(); + void CryptonightR_soft_aes_template_double_part3(); + void CryptonightR_soft_aes_template_double_part4(); + void CryptonightR_soft_aes_template_double_end(); + + void CryptonightR_instruction0(); + void CryptonightR_instruction1(); + void CryptonightR_instruction2(); + void CryptonightR_instruction3(); + void CryptonightR_instruction4(); + void CryptonightR_instruction5(); + void CryptonightR_instruction6(); + void CryptonightR_instruction7(); + void CryptonightR_instruction8(); + void CryptonightR_instruction9(); + void CryptonightR_instruction10(); + void CryptonightR_instruction11(); + void CryptonightR_instruction12(); + void CryptonightR_instruction13(); + void CryptonightR_instruction14(); + void CryptonightR_instruction15(); + void CryptonightR_instruction16(); + void CryptonightR_instruction17(); + void CryptonightR_instruction18(); + void CryptonightR_instruction19(); + void CryptonightR_instruction20(); + void CryptonightR_instruction21(); + void CryptonightR_instruction22(); + void CryptonightR_instruction23(); + void CryptonightR_instruction24(); + void CryptonightR_instruction25(); + void CryptonightR_instruction26(); + void CryptonightR_instruction27(); + void CryptonightR_instruction28(); + void CryptonightR_instruction29(); + void CryptonightR_instruction30(); + void CryptonightR_instruction31(); + void CryptonightR_instruction32(); + void CryptonightR_instruction33(); + void CryptonightR_instruction34(); + void CryptonightR_instruction35(); + void CryptonightR_instruction36(); + void CryptonightR_instruction37(); + void CryptonightR_instruction38(); + void CryptonightR_instruction39(); + void CryptonightR_instruction40(); + void CryptonightR_instruction41(); + void CryptonightR_instruction42(); + void CryptonightR_instruction43(); + void CryptonightR_instruction44(); + void CryptonightR_instruction45(); + void CryptonightR_instruction46(); + void CryptonightR_instruction47(); + void CryptonightR_instruction48(); + void CryptonightR_instruction49(); + void CryptonightR_instruction50(); + void CryptonightR_instruction51(); + void CryptonightR_instruction52(); + void CryptonightR_instruction53(); + void CryptonightR_instruction54(); + void CryptonightR_instruction55(); + void CryptonightR_instruction56(); + void CryptonightR_instruction57(); + void CryptonightR_instruction58(); + void CryptonightR_instruction59(); + void CryptonightR_instruction60(); + void CryptonightR_instruction61(); + void CryptonightR_instruction62(); + void CryptonightR_instruction63(); + void CryptonightR_instruction64(); + void CryptonightR_instruction65(); + void CryptonightR_instruction66(); + void CryptonightR_instruction67(); + void CryptonightR_instruction68(); + void CryptonightR_instruction69(); + void CryptonightR_instruction70(); + void CryptonightR_instruction71(); + void CryptonightR_instruction72(); + void CryptonightR_instruction73(); + void CryptonightR_instruction74(); + void CryptonightR_instruction75(); + void CryptonightR_instruction76(); + void CryptonightR_instruction77(); + void CryptonightR_instruction78(); + void CryptonightR_instruction79(); + void CryptonightR_instruction80(); + void CryptonightR_instruction81(); + void CryptonightR_instruction82(); + void CryptonightR_instruction83(); + void CryptonightR_instruction84(); + void CryptonightR_instruction85(); + void CryptonightR_instruction86(); + void CryptonightR_instruction87(); + void CryptonightR_instruction88(); + void CryptonightR_instruction89(); + void 
CryptonightR_instruction90(); + void CryptonightR_instruction91(); + void CryptonightR_instruction92(); + void CryptonightR_instruction93(); + void CryptonightR_instruction94(); + void CryptonightR_instruction95(); + void CryptonightR_instruction96(); + void CryptonightR_instruction97(); + void CryptonightR_instruction98(); + void CryptonightR_instruction99(); + void CryptonightR_instruction100(); + void CryptonightR_instruction101(); + void CryptonightR_instruction102(); + void CryptonightR_instruction103(); + void CryptonightR_instruction104(); + void CryptonightR_instruction105(); + void CryptonightR_instruction106(); + void CryptonightR_instruction107(); + void CryptonightR_instruction108(); + void CryptonightR_instruction109(); + void CryptonightR_instruction110(); + void CryptonightR_instruction111(); + void CryptonightR_instruction112(); + void CryptonightR_instruction113(); + void CryptonightR_instruction114(); + void CryptonightR_instruction115(); + void CryptonightR_instruction116(); + void CryptonightR_instruction117(); + void CryptonightR_instruction118(); + void CryptonightR_instruction119(); + void CryptonightR_instruction120(); + void CryptonightR_instruction121(); + void CryptonightR_instruction122(); + void CryptonightR_instruction123(); + void CryptonightR_instruction124(); + void CryptonightR_instruction125(); + void CryptonightR_instruction126(); + void CryptonightR_instruction127(); + void CryptonightR_instruction128(); + void CryptonightR_instruction129(); + void CryptonightR_instruction130(); + void CryptonightR_instruction131(); + void CryptonightR_instruction132(); + void CryptonightR_instruction133(); + void CryptonightR_instruction134(); + void CryptonightR_instruction135(); + void CryptonightR_instruction136(); + void CryptonightR_instruction137(); + void CryptonightR_instruction138(); + void CryptonightR_instruction139(); + void CryptonightR_instruction140(); + void CryptonightR_instruction141(); + void CryptonightR_instruction142(); + void CryptonightR_instruction143(); + void CryptonightR_instruction144(); + void CryptonightR_instruction145(); + void CryptonightR_instruction146(); + void CryptonightR_instruction147(); + void CryptonightR_instruction148(); + void CryptonightR_instruction149(); + void CryptonightR_instruction150(); + void CryptonightR_instruction151(); + void CryptonightR_instruction152(); + void CryptonightR_instruction153(); + void CryptonightR_instruction154(); + void CryptonightR_instruction155(); + void CryptonightR_instruction156(); + void CryptonightR_instruction157(); + void CryptonightR_instruction158(); + void CryptonightR_instruction159(); + void CryptonightR_instruction160(); + void CryptonightR_instruction161(); + void CryptonightR_instruction162(); + void CryptonightR_instruction163(); + void CryptonightR_instruction164(); + void CryptonightR_instruction165(); + void CryptonightR_instruction166(); + void CryptonightR_instruction167(); + void CryptonightR_instruction168(); + void CryptonightR_instruction169(); + void CryptonightR_instruction170(); + void CryptonightR_instruction171(); + void CryptonightR_instruction172(); + void CryptonightR_instruction173(); + void CryptonightR_instruction174(); + void CryptonightR_instruction175(); + void CryptonightR_instruction176(); + void CryptonightR_instruction177(); + void CryptonightR_instruction178(); + void CryptonightR_instruction179(); + void CryptonightR_instruction180(); + void CryptonightR_instruction181(); + void CryptonightR_instruction182(); + void CryptonightR_instruction183(); 
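// [Annotation, not part of the generated header] A minimal sketch of how the
// 257-entry stub tables declared in this file appear to be laid out. The enum
// and helper below are illustrative assumptions, not identifiers from this
// patch: destinations are rbx/rsi/rdi/rbp (0..3), sources are those four plus
// rsp/r15/rax/rdx (0..7), and each (dst, src) pair owns eight opcode slots.
enum StubOp { MUL0, MUL1, MUL2, ADD_, SUB_, ROR_, ROL_, XOR_ }; // slots 0..7

// Returns 0..255; entry 256 is one extra trailing stub. In slots where
// dst == src, the ADD/SUB/XOR stubs read r9 instead (visible around
// CryptonightR_instruction123 in the template above).
inline int stub_index(int op, int dst, int src)
{
    return src * 32 + dst * 8 + op;
}
// For ROR/ROL a code generator would presumably copy instructions_mov[i]
// (the "mov rcx, <src>" prefix) immediately before instructions[i].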
+ void CryptonightR_instruction184(); + void CryptonightR_instruction185(); + void CryptonightR_instruction186(); + void CryptonightR_instruction187(); + void CryptonightR_instruction188(); + void CryptonightR_instruction189(); + void CryptonightR_instruction190(); + void CryptonightR_instruction191(); + void CryptonightR_instruction192(); + void CryptonightR_instruction193(); + void CryptonightR_instruction194(); + void CryptonightR_instruction195(); + void CryptonightR_instruction196(); + void CryptonightR_instruction197(); + void CryptonightR_instruction198(); + void CryptonightR_instruction199(); + void CryptonightR_instruction200(); + void CryptonightR_instruction201(); + void CryptonightR_instruction202(); + void CryptonightR_instruction203(); + void CryptonightR_instruction204(); + void CryptonightR_instruction205(); + void CryptonightR_instruction206(); + void CryptonightR_instruction207(); + void CryptonightR_instruction208(); + void CryptonightR_instruction209(); + void CryptonightR_instruction210(); + void CryptonightR_instruction211(); + void CryptonightR_instruction212(); + void CryptonightR_instruction213(); + void CryptonightR_instruction214(); + void CryptonightR_instruction215(); + void CryptonightR_instruction216(); + void CryptonightR_instruction217(); + void CryptonightR_instruction218(); + void CryptonightR_instruction219(); + void CryptonightR_instruction220(); + void CryptonightR_instruction221(); + void CryptonightR_instruction222(); + void CryptonightR_instruction223(); + void CryptonightR_instruction224(); + void CryptonightR_instruction225(); + void CryptonightR_instruction226(); + void CryptonightR_instruction227(); + void CryptonightR_instruction228(); + void CryptonightR_instruction229(); + void CryptonightR_instruction230(); + void CryptonightR_instruction231(); + void CryptonightR_instruction232(); + void CryptonightR_instruction233(); + void CryptonightR_instruction234(); + void CryptonightR_instruction235(); + void CryptonightR_instruction236(); + void CryptonightR_instruction237(); + void CryptonightR_instruction238(); + void CryptonightR_instruction239(); + void CryptonightR_instruction240(); + void CryptonightR_instruction241(); + void CryptonightR_instruction242(); + void CryptonightR_instruction243(); + void CryptonightR_instruction244(); + void CryptonightR_instruction245(); + void CryptonightR_instruction246(); + void CryptonightR_instruction247(); + void CryptonightR_instruction248(); + void CryptonightR_instruction249(); + void CryptonightR_instruction250(); + void CryptonightR_instruction251(); + void CryptonightR_instruction252(); + void CryptonightR_instruction253(); + void CryptonightR_instruction254(); + void CryptonightR_instruction255(); + void CryptonightR_instruction256(); + void CryptonightR_instruction_mov0(); + void CryptonightR_instruction_mov1(); + void CryptonightR_instruction_mov2(); + void CryptonightR_instruction_mov3(); + void CryptonightR_instruction_mov4(); + void CryptonightR_instruction_mov5(); + void CryptonightR_instruction_mov6(); + void CryptonightR_instruction_mov7(); + void CryptonightR_instruction_mov8(); + void CryptonightR_instruction_mov9(); + void CryptonightR_instruction_mov10(); + void CryptonightR_instruction_mov11(); + void CryptonightR_instruction_mov12(); + void CryptonightR_instruction_mov13(); + void CryptonightR_instruction_mov14(); + void CryptonightR_instruction_mov15(); + void CryptonightR_instruction_mov16(); + void CryptonightR_instruction_mov17(); + void CryptonightR_instruction_mov18(); + void 
CryptonightR_instruction_mov19(); + void CryptonightR_instruction_mov20(); + void CryptonightR_instruction_mov21(); + void CryptonightR_instruction_mov22(); + void CryptonightR_instruction_mov23(); + void CryptonightR_instruction_mov24(); + void CryptonightR_instruction_mov25(); + void CryptonightR_instruction_mov26(); + void CryptonightR_instruction_mov27(); + void CryptonightR_instruction_mov28(); + void CryptonightR_instruction_mov29(); + void CryptonightR_instruction_mov30(); + void CryptonightR_instruction_mov31(); + void CryptonightR_instruction_mov32(); + void CryptonightR_instruction_mov33(); + void CryptonightR_instruction_mov34(); + void CryptonightR_instruction_mov35(); + void CryptonightR_instruction_mov36(); + void CryptonightR_instruction_mov37(); + void CryptonightR_instruction_mov38(); + void CryptonightR_instruction_mov39(); + void CryptonightR_instruction_mov40(); + void CryptonightR_instruction_mov41(); + void CryptonightR_instruction_mov42(); + void CryptonightR_instruction_mov43(); + void CryptonightR_instruction_mov44(); + void CryptonightR_instruction_mov45(); + void CryptonightR_instruction_mov46(); + void CryptonightR_instruction_mov47(); + void CryptonightR_instruction_mov48(); + void CryptonightR_instruction_mov49(); + void CryptonightR_instruction_mov50(); + void CryptonightR_instruction_mov51(); + void CryptonightR_instruction_mov52(); + void CryptonightR_instruction_mov53(); + void CryptonightR_instruction_mov54(); + void CryptonightR_instruction_mov55(); + void CryptonightR_instruction_mov56(); + void CryptonightR_instruction_mov57(); + void CryptonightR_instruction_mov58(); + void CryptonightR_instruction_mov59(); + void CryptonightR_instruction_mov60(); + void CryptonightR_instruction_mov61(); + void CryptonightR_instruction_mov62(); + void CryptonightR_instruction_mov63(); + void CryptonightR_instruction_mov64(); + void CryptonightR_instruction_mov65(); + void CryptonightR_instruction_mov66(); + void CryptonightR_instruction_mov67(); + void CryptonightR_instruction_mov68(); + void CryptonightR_instruction_mov69(); + void CryptonightR_instruction_mov70(); + void CryptonightR_instruction_mov71(); + void CryptonightR_instruction_mov72(); + void CryptonightR_instruction_mov73(); + void CryptonightR_instruction_mov74(); + void CryptonightR_instruction_mov75(); + void CryptonightR_instruction_mov76(); + void CryptonightR_instruction_mov77(); + void CryptonightR_instruction_mov78(); + void CryptonightR_instruction_mov79(); + void CryptonightR_instruction_mov80(); + void CryptonightR_instruction_mov81(); + void CryptonightR_instruction_mov82(); + void CryptonightR_instruction_mov83(); + void CryptonightR_instruction_mov84(); + void CryptonightR_instruction_mov85(); + void CryptonightR_instruction_mov86(); + void CryptonightR_instruction_mov87(); + void CryptonightR_instruction_mov88(); + void CryptonightR_instruction_mov89(); + void CryptonightR_instruction_mov90(); + void CryptonightR_instruction_mov91(); + void CryptonightR_instruction_mov92(); + void CryptonightR_instruction_mov93(); + void CryptonightR_instruction_mov94(); + void CryptonightR_instruction_mov95(); + void CryptonightR_instruction_mov96(); + void CryptonightR_instruction_mov97(); + void CryptonightR_instruction_mov98(); + void CryptonightR_instruction_mov99(); + void CryptonightR_instruction_mov100(); + void CryptonightR_instruction_mov101(); + void CryptonightR_instruction_mov102(); + void CryptonightR_instruction_mov103(); + void CryptonightR_instruction_mov104(); + void 
CryptonightR_instruction_mov105(); + void CryptonightR_instruction_mov106(); + void CryptonightR_instruction_mov107(); + void CryptonightR_instruction_mov108(); + void CryptonightR_instruction_mov109(); + void CryptonightR_instruction_mov110(); + void CryptonightR_instruction_mov111(); + void CryptonightR_instruction_mov112(); + void CryptonightR_instruction_mov113(); + void CryptonightR_instruction_mov114(); + void CryptonightR_instruction_mov115(); + void CryptonightR_instruction_mov116(); + void CryptonightR_instruction_mov117(); + void CryptonightR_instruction_mov118(); + void CryptonightR_instruction_mov119(); + void CryptonightR_instruction_mov120(); + void CryptonightR_instruction_mov121(); + void CryptonightR_instruction_mov122(); + void CryptonightR_instruction_mov123(); + void CryptonightR_instruction_mov124(); + void CryptonightR_instruction_mov125(); + void CryptonightR_instruction_mov126(); + void CryptonightR_instruction_mov127(); + void CryptonightR_instruction_mov128(); + void CryptonightR_instruction_mov129(); + void CryptonightR_instruction_mov130(); + void CryptonightR_instruction_mov131(); + void CryptonightR_instruction_mov132(); + void CryptonightR_instruction_mov133(); + void CryptonightR_instruction_mov134(); + void CryptonightR_instruction_mov135(); + void CryptonightR_instruction_mov136(); + void CryptonightR_instruction_mov137(); + void CryptonightR_instruction_mov138(); + void CryptonightR_instruction_mov139(); + void CryptonightR_instruction_mov140(); + void CryptonightR_instruction_mov141(); + void CryptonightR_instruction_mov142(); + void CryptonightR_instruction_mov143(); + void CryptonightR_instruction_mov144(); + void CryptonightR_instruction_mov145(); + void CryptonightR_instruction_mov146(); + void CryptonightR_instruction_mov147(); + void CryptonightR_instruction_mov148(); + void CryptonightR_instruction_mov149(); + void CryptonightR_instruction_mov150(); + void CryptonightR_instruction_mov151(); + void CryptonightR_instruction_mov152(); + void CryptonightR_instruction_mov153(); + void CryptonightR_instruction_mov154(); + void CryptonightR_instruction_mov155(); + void CryptonightR_instruction_mov156(); + void CryptonightR_instruction_mov157(); + void CryptonightR_instruction_mov158(); + void CryptonightR_instruction_mov159(); + void CryptonightR_instruction_mov160(); + void CryptonightR_instruction_mov161(); + void CryptonightR_instruction_mov162(); + void CryptonightR_instruction_mov163(); + void CryptonightR_instruction_mov164(); + void CryptonightR_instruction_mov165(); + void CryptonightR_instruction_mov166(); + void CryptonightR_instruction_mov167(); + void CryptonightR_instruction_mov168(); + void CryptonightR_instruction_mov169(); + void CryptonightR_instruction_mov170(); + void CryptonightR_instruction_mov171(); + void CryptonightR_instruction_mov172(); + void CryptonightR_instruction_mov173(); + void CryptonightR_instruction_mov174(); + void CryptonightR_instruction_mov175(); + void CryptonightR_instruction_mov176(); + void CryptonightR_instruction_mov177(); + void CryptonightR_instruction_mov178(); + void CryptonightR_instruction_mov179(); + void CryptonightR_instruction_mov180(); + void CryptonightR_instruction_mov181(); + void CryptonightR_instruction_mov182(); + void CryptonightR_instruction_mov183(); + void CryptonightR_instruction_mov184(); + void CryptonightR_instruction_mov185(); + void CryptonightR_instruction_mov186(); + void CryptonightR_instruction_mov187(); + void CryptonightR_instruction_mov188(); + void 
CryptonightR_instruction_mov189(); + void CryptonightR_instruction_mov190(); + void CryptonightR_instruction_mov191(); + void CryptonightR_instruction_mov192(); + void CryptonightR_instruction_mov193(); + void CryptonightR_instruction_mov194(); + void CryptonightR_instruction_mov195(); + void CryptonightR_instruction_mov196(); + void CryptonightR_instruction_mov197(); + void CryptonightR_instruction_mov198(); + void CryptonightR_instruction_mov199(); + void CryptonightR_instruction_mov200(); + void CryptonightR_instruction_mov201(); + void CryptonightR_instruction_mov202(); + void CryptonightR_instruction_mov203(); + void CryptonightR_instruction_mov204(); + void CryptonightR_instruction_mov205(); + void CryptonightR_instruction_mov206(); + void CryptonightR_instruction_mov207(); + void CryptonightR_instruction_mov208(); + void CryptonightR_instruction_mov209(); + void CryptonightR_instruction_mov210(); + void CryptonightR_instruction_mov211(); + void CryptonightR_instruction_mov212(); + void CryptonightR_instruction_mov213(); + void CryptonightR_instruction_mov214(); + void CryptonightR_instruction_mov215(); + void CryptonightR_instruction_mov216(); + void CryptonightR_instruction_mov217(); + void CryptonightR_instruction_mov218(); + void CryptonightR_instruction_mov219(); + void CryptonightR_instruction_mov220(); + void CryptonightR_instruction_mov221(); + void CryptonightR_instruction_mov222(); + void CryptonightR_instruction_mov223(); + void CryptonightR_instruction_mov224(); + void CryptonightR_instruction_mov225(); + void CryptonightR_instruction_mov226(); + void CryptonightR_instruction_mov227(); + void CryptonightR_instruction_mov228(); + void CryptonightR_instruction_mov229(); + void CryptonightR_instruction_mov230(); + void CryptonightR_instruction_mov231(); + void CryptonightR_instruction_mov232(); + void CryptonightR_instruction_mov233(); + void CryptonightR_instruction_mov234(); + void CryptonightR_instruction_mov235(); + void CryptonightR_instruction_mov236(); + void CryptonightR_instruction_mov237(); + void CryptonightR_instruction_mov238(); + void CryptonightR_instruction_mov239(); + void CryptonightR_instruction_mov240(); + void CryptonightR_instruction_mov241(); + void CryptonightR_instruction_mov242(); + void CryptonightR_instruction_mov243(); + void CryptonightR_instruction_mov244(); + void CryptonightR_instruction_mov245(); + void CryptonightR_instruction_mov246(); + void CryptonightR_instruction_mov247(); + void CryptonightR_instruction_mov248(); + void CryptonightR_instruction_mov249(); + void CryptonightR_instruction_mov250(); + void CryptonightR_instruction_mov251(); + void CryptonightR_instruction_mov252(); + void CryptonightR_instruction_mov253(); + void CryptonightR_instruction_mov254(); + void CryptonightR_instruction_mov255(); + void CryptonightR_instruction_mov256(); +} + +const void_func instructions[257] = { + CryptonightR_instruction0, + CryptonightR_instruction1, + CryptonightR_instruction2, + CryptonightR_instruction3, + CryptonightR_instruction4, + CryptonightR_instruction5, + CryptonightR_instruction6, + CryptonightR_instruction7, + CryptonightR_instruction8, + CryptonightR_instruction9, + CryptonightR_instruction10, + CryptonightR_instruction11, + CryptonightR_instruction12, + CryptonightR_instruction13, + CryptonightR_instruction14, + CryptonightR_instruction15, + CryptonightR_instruction16, + CryptonightR_instruction17, + CryptonightR_instruction18, + CryptonightR_instruction19, + CryptonightR_instruction20, + CryptonightR_instruction21, + 
CryptonightR_instruction22, + CryptonightR_instruction23, + CryptonightR_instruction24, + CryptonightR_instruction25, + CryptonightR_instruction26, + CryptonightR_instruction27, + CryptonightR_instruction28, + CryptonightR_instruction29, + CryptonightR_instruction30, + CryptonightR_instruction31, + CryptonightR_instruction32, + CryptonightR_instruction33, + CryptonightR_instruction34, + CryptonightR_instruction35, + CryptonightR_instruction36, + CryptonightR_instruction37, + CryptonightR_instruction38, + CryptonightR_instruction39, + CryptonightR_instruction40, + CryptonightR_instruction41, + CryptonightR_instruction42, + CryptonightR_instruction43, + CryptonightR_instruction44, + CryptonightR_instruction45, + CryptonightR_instruction46, + CryptonightR_instruction47, + CryptonightR_instruction48, + CryptonightR_instruction49, + CryptonightR_instruction50, + CryptonightR_instruction51, + CryptonightR_instruction52, + CryptonightR_instruction53, + CryptonightR_instruction54, + CryptonightR_instruction55, + CryptonightR_instruction56, + CryptonightR_instruction57, + CryptonightR_instruction58, + CryptonightR_instruction59, + CryptonightR_instruction60, + CryptonightR_instruction61, + CryptonightR_instruction62, + CryptonightR_instruction63, + CryptonightR_instruction64, + CryptonightR_instruction65, + CryptonightR_instruction66, + CryptonightR_instruction67, + CryptonightR_instruction68, + CryptonightR_instruction69, + CryptonightR_instruction70, + CryptonightR_instruction71, + CryptonightR_instruction72, + CryptonightR_instruction73, + CryptonightR_instruction74, + CryptonightR_instruction75, + CryptonightR_instruction76, + CryptonightR_instruction77, + CryptonightR_instruction78, + CryptonightR_instruction79, + CryptonightR_instruction80, + CryptonightR_instruction81, + CryptonightR_instruction82, + CryptonightR_instruction83, + CryptonightR_instruction84, + CryptonightR_instruction85, + CryptonightR_instruction86, + CryptonightR_instruction87, + CryptonightR_instruction88, + CryptonightR_instruction89, + CryptonightR_instruction90, + CryptonightR_instruction91, + CryptonightR_instruction92, + CryptonightR_instruction93, + CryptonightR_instruction94, + CryptonightR_instruction95, + CryptonightR_instruction96, + CryptonightR_instruction97, + CryptonightR_instruction98, + CryptonightR_instruction99, + CryptonightR_instruction100, + CryptonightR_instruction101, + CryptonightR_instruction102, + CryptonightR_instruction103, + CryptonightR_instruction104, + CryptonightR_instruction105, + CryptonightR_instruction106, + CryptonightR_instruction107, + CryptonightR_instruction108, + CryptonightR_instruction109, + CryptonightR_instruction110, + CryptonightR_instruction111, + CryptonightR_instruction112, + CryptonightR_instruction113, + CryptonightR_instruction114, + CryptonightR_instruction115, + CryptonightR_instruction116, + CryptonightR_instruction117, + CryptonightR_instruction118, + CryptonightR_instruction119, + CryptonightR_instruction120, + CryptonightR_instruction121, + CryptonightR_instruction122, + CryptonightR_instruction123, + CryptonightR_instruction124, + CryptonightR_instruction125, + CryptonightR_instruction126, + CryptonightR_instruction127, + CryptonightR_instruction128, + CryptonightR_instruction129, + CryptonightR_instruction130, + CryptonightR_instruction131, + CryptonightR_instruction132, + CryptonightR_instruction133, + CryptonightR_instruction134, + CryptonightR_instruction135, + CryptonightR_instruction136, + CryptonightR_instruction137, + CryptonightR_instruction138, + 
CryptonightR_instruction139, + CryptonightR_instruction140, + CryptonightR_instruction141, + CryptonightR_instruction142, + CryptonightR_instruction143, + CryptonightR_instruction144, + CryptonightR_instruction145, + CryptonightR_instruction146, + CryptonightR_instruction147, + CryptonightR_instruction148, + CryptonightR_instruction149, + CryptonightR_instruction150, + CryptonightR_instruction151, + CryptonightR_instruction152, + CryptonightR_instruction153, + CryptonightR_instruction154, + CryptonightR_instruction155, + CryptonightR_instruction156, + CryptonightR_instruction157, + CryptonightR_instruction158, + CryptonightR_instruction159, + CryptonightR_instruction160, + CryptonightR_instruction161, + CryptonightR_instruction162, + CryptonightR_instruction163, + CryptonightR_instruction164, + CryptonightR_instruction165, + CryptonightR_instruction166, + CryptonightR_instruction167, + CryptonightR_instruction168, + CryptonightR_instruction169, + CryptonightR_instruction170, + CryptonightR_instruction171, + CryptonightR_instruction172, + CryptonightR_instruction173, + CryptonightR_instruction174, + CryptonightR_instruction175, + CryptonightR_instruction176, + CryptonightR_instruction177, + CryptonightR_instruction178, + CryptonightR_instruction179, + CryptonightR_instruction180, + CryptonightR_instruction181, + CryptonightR_instruction182, + CryptonightR_instruction183, + CryptonightR_instruction184, + CryptonightR_instruction185, + CryptonightR_instruction186, + CryptonightR_instruction187, + CryptonightR_instruction188, + CryptonightR_instruction189, + CryptonightR_instruction190, + CryptonightR_instruction191, + CryptonightR_instruction192, + CryptonightR_instruction193, + CryptonightR_instruction194, + CryptonightR_instruction195, + CryptonightR_instruction196, + CryptonightR_instruction197, + CryptonightR_instruction198, + CryptonightR_instruction199, + CryptonightR_instruction200, + CryptonightR_instruction201, + CryptonightR_instruction202, + CryptonightR_instruction203, + CryptonightR_instruction204, + CryptonightR_instruction205, + CryptonightR_instruction206, + CryptonightR_instruction207, + CryptonightR_instruction208, + CryptonightR_instruction209, + CryptonightR_instruction210, + CryptonightR_instruction211, + CryptonightR_instruction212, + CryptonightR_instruction213, + CryptonightR_instruction214, + CryptonightR_instruction215, + CryptonightR_instruction216, + CryptonightR_instruction217, + CryptonightR_instruction218, + CryptonightR_instruction219, + CryptonightR_instruction220, + CryptonightR_instruction221, + CryptonightR_instruction222, + CryptonightR_instruction223, + CryptonightR_instruction224, + CryptonightR_instruction225, + CryptonightR_instruction226, + CryptonightR_instruction227, + CryptonightR_instruction228, + CryptonightR_instruction229, + CryptonightR_instruction230, + CryptonightR_instruction231, + CryptonightR_instruction232, + CryptonightR_instruction233, + CryptonightR_instruction234, + CryptonightR_instruction235, + CryptonightR_instruction236, + CryptonightR_instruction237, + CryptonightR_instruction238, + CryptonightR_instruction239, + CryptonightR_instruction240, + CryptonightR_instruction241, + CryptonightR_instruction242, + CryptonightR_instruction243, + CryptonightR_instruction244, + CryptonightR_instruction245, + CryptonightR_instruction246, + CryptonightR_instruction247, + CryptonightR_instruction248, + CryptonightR_instruction249, + CryptonightR_instruction250, + CryptonightR_instruction251, + CryptonightR_instruction252, + 
CryptonightR_instruction253, + CryptonightR_instruction254, + CryptonightR_instruction255, + CryptonightR_instruction256, +}; + +const void_func instructions_mov[257] = { + CryptonightR_instruction_mov0, + CryptonightR_instruction_mov1, + CryptonightR_instruction_mov2, + CryptonightR_instruction_mov3, + CryptonightR_instruction_mov4, + CryptonightR_instruction_mov5, + CryptonightR_instruction_mov6, + CryptonightR_instruction_mov7, + CryptonightR_instruction_mov8, + CryptonightR_instruction_mov9, + CryptonightR_instruction_mov10, + CryptonightR_instruction_mov11, + CryptonightR_instruction_mov12, + CryptonightR_instruction_mov13, + CryptonightR_instruction_mov14, + CryptonightR_instruction_mov15, + CryptonightR_instruction_mov16, + CryptonightR_instruction_mov17, + CryptonightR_instruction_mov18, + CryptonightR_instruction_mov19, + CryptonightR_instruction_mov20, + CryptonightR_instruction_mov21, + CryptonightR_instruction_mov22, + CryptonightR_instruction_mov23, + CryptonightR_instruction_mov24, + CryptonightR_instruction_mov25, + CryptonightR_instruction_mov26, + CryptonightR_instruction_mov27, + CryptonightR_instruction_mov28, + CryptonightR_instruction_mov29, + CryptonightR_instruction_mov30, + CryptonightR_instruction_mov31, + CryptonightR_instruction_mov32, + CryptonightR_instruction_mov33, + CryptonightR_instruction_mov34, + CryptonightR_instruction_mov35, + CryptonightR_instruction_mov36, + CryptonightR_instruction_mov37, + CryptonightR_instruction_mov38, + CryptonightR_instruction_mov39, + CryptonightR_instruction_mov40, + CryptonightR_instruction_mov41, + CryptonightR_instruction_mov42, + CryptonightR_instruction_mov43, + CryptonightR_instruction_mov44, + CryptonightR_instruction_mov45, + CryptonightR_instruction_mov46, + CryptonightR_instruction_mov47, + CryptonightR_instruction_mov48, + CryptonightR_instruction_mov49, + CryptonightR_instruction_mov50, + CryptonightR_instruction_mov51, + CryptonightR_instruction_mov52, + CryptonightR_instruction_mov53, + CryptonightR_instruction_mov54, + CryptonightR_instruction_mov55, + CryptonightR_instruction_mov56, + CryptonightR_instruction_mov57, + CryptonightR_instruction_mov58, + CryptonightR_instruction_mov59, + CryptonightR_instruction_mov60, + CryptonightR_instruction_mov61, + CryptonightR_instruction_mov62, + CryptonightR_instruction_mov63, + CryptonightR_instruction_mov64, + CryptonightR_instruction_mov65, + CryptonightR_instruction_mov66, + CryptonightR_instruction_mov67, + CryptonightR_instruction_mov68, + CryptonightR_instruction_mov69, + CryptonightR_instruction_mov70, + CryptonightR_instruction_mov71, + CryptonightR_instruction_mov72, + CryptonightR_instruction_mov73, + CryptonightR_instruction_mov74, + CryptonightR_instruction_mov75, + CryptonightR_instruction_mov76, + CryptonightR_instruction_mov77, + CryptonightR_instruction_mov78, + CryptonightR_instruction_mov79, + CryptonightR_instruction_mov80, + CryptonightR_instruction_mov81, + CryptonightR_instruction_mov82, + CryptonightR_instruction_mov83, + CryptonightR_instruction_mov84, + CryptonightR_instruction_mov85, + CryptonightR_instruction_mov86, + CryptonightR_instruction_mov87, + CryptonightR_instruction_mov88, + CryptonightR_instruction_mov89, + CryptonightR_instruction_mov90, + CryptonightR_instruction_mov91, + CryptonightR_instruction_mov92, + CryptonightR_instruction_mov93, + CryptonightR_instruction_mov94, + CryptonightR_instruction_mov95, + CryptonightR_instruction_mov96, + CryptonightR_instruction_mov97, + CryptonightR_instruction_mov98, + 
CryptonightR_instruction_mov99, + CryptonightR_instruction_mov100, + CryptonightR_instruction_mov101, + CryptonightR_instruction_mov102, + CryptonightR_instruction_mov103, + CryptonightR_instruction_mov104, + CryptonightR_instruction_mov105, + CryptonightR_instruction_mov106, + CryptonightR_instruction_mov107, + CryptonightR_instruction_mov108, + CryptonightR_instruction_mov109, + CryptonightR_instruction_mov110, + CryptonightR_instruction_mov111, + CryptonightR_instruction_mov112, + CryptonightR_instruction_mov113, + CryptonightR_instruction_mov114, + CryptonightR_instruction_mov115, + CryptonightR_instruction_mov116, + CryptonightR_instruction_mov117, + CryptonightR_instruction_mov118, + CryptonightR_instruction_mov119, + CryptonightR_instruction_mov120, + CryptonightR_instruction_mov121, + CryptonightR_instruction_mov122, + CryptonightR_instruction_mov123, + CryptonightR_instruction_mov124, + CryptonightR_instruction_mov125, + CryptonightR_instruction_mov126, + CryptonightR_instruction_mov127, + CryptonightR_instruction_mov128, + CryptonightR_instruction_mov129, + CryptonightR_instruction_mov130, + CryptonightR_instruction_mov131, + CryptonightR_instruction_mov132, + CryptonightR_instruction_mov133, + CryptonightR_instruction_mov134, + CryptonightR_instruction_mov135, + CryptonightR_instruction_mov136, + CryptonightR_instruction_mov137, + CryptonightR_instruction_mov138, + CryptonightR_instruction_mov139, + CryptonightR_instruction_mov140, + CryptonightR_instruction_mov141, + CryptonightR_instruction_mov142, + CryptonightR_instruction_mov143, + CryptonightR_instruction_mov144, + CryptonightR_instruction_mov145, + CryptonightR_instruction_mov146, + CryptonightR_instruction_mov147, + CryptonightR_instruction_mov148, + CryptonightR_instruction_mov149, + CryptonightR_instruction_mov150, + CryptonightR_instruction_mov151, + CryptonightR_instruction_mov152, + CryptonightR_instruction_mov153, + CryptonightR_instruction_mov154, + CryptonightR_instruction_mov155, + CryptonightR_instruction_mov156, + CryptonightR_instruction_mov157, + CryptonightR_instruction_mov158, + CryptonightR_instruction_mov159, + CryptonightR_instruction_mov160, + CryptonightR_instruction_mov161, + CryptonightR_instruction_mov162, + CryptonightR_instruction_mov163, + CryptonightR_instruction_mov164, + CryptonightR_instruction_mov165, + CryptonightR_instruction_mov166, + CryptonightR_instruction_mov167, + CryptonightR_instruction_mov168, + CryptonightR_instruction_mov169, + CryptonightR_instruction_mov170, + CryptonightR_instruction_mov171, + CryptonightR_instruction_mov172, + CryptonightR_instruction_mov173, + CryptonightR_instruction_mov174, + CryptonightR_instruction_mov175, + CryptonightR_instruction_mov176, + CryptonightR_instruction_mov177, + CryptonightR_instruction_mov178, + CryptonightR_instruction_mov179, + CryptonightR_instruction_mov180, + CryptonightR_instruction_mov181, + CryptonightR_instruction_mov182, + CryptonightR_instruction_mov183, + CryptonightR_instruction_mov184, + CryptonightR_instruction_mov185, + CryptonightR_instruction_mov186, + CryptonightR_instruction_mov187, + CryptonightR_instruction_mov188, + CryptonightR_instruction_mov189, + CryptonightR_instruction_mov190, + CryptonightR_instruction_mov191, + CryptonightR_instruction_mov192, + CryptonightR_instruction_mov193, + CryptonightR_instruction_mov194, + CryptonightR_instruction_mov195, + CryptonightR_instruction_mov196, + CryptonightR_instruction_mov197, + CryptonightR_instruction_mov198, + CryptonightR_instruction_mov199, + 
CryptonightR_instruction_mov200, + CryptonightR_instruction_mov201, + CryptonightR_instruction_mov202, + CryptonightR_instruction_mov203, + CryptonightR_instruction_mov204, + CryptonightR_instruction_mov205, + CryptonightR_instruction_mov206, + CryptonightR_instruction_mov207, + CryptonightR_instruction_mov208, + CryptonightR_instruction_mov209, + CryptonightR_instruction_mov210, + CryptonightR_instruction_mov211, + CryptonightR_instruction_mov212, + CryptonightR_instruction_mov213, + CryptonightR_instruction_mov214, + CryptonightR_instruction_mov215, + CryptonightR_instruction_mov216, + CryptonightR_instruction_mov217, + CryptonightR_instruction_mov218, + CryptonightR_instruction_mov219, + CryptonightR_instruction_mov220, + CryptonightR_instruction_mov221, + CryptonightR_instruction_mov222, + CryptonightR_instruction_mov223, + CryptonightR_instruction_mov224, + CryptonightR_instruction_mov225, + CryptonightR_instruction_mov226, + CryptonightR_instruction_mov227, + CryptonightR_instruction_mov228, + CryptonightR_instruction_mov229, + CryptonightR_instruction_mov230, + CryptonightR_instruction_mov231, + CryptonightR_instruction_mov232, + CryptonightR_instruction_mov233, + CryptonightR_instruction_mov234, + CryptonightR_instruction_mov235, + CryptonightR_instruction_mov236, + CryptonightR_instruction_mov237, + CryptonightR_instruction_mov238, + CryptonightR_instruction_mov239, + CryptonightR_instruction_mov240, + CryptonightR_instruction_mov241, + CryptonightR_instruction_mov242, + CryptonightR_instruction_mov243, + CryptonightR_instruction_mov244, + CryptonightR_instruction_mov245, + CryptonightR_instruction_mov246, + CryptonightR_instruction_mov247, + CryptonightR_instruction_mov248, + CryptonightR_instruction_mov249, + CryptonightR_instruction_mov250, + CryptonightR_instruction_mov251, + CryptonightR_instruction_mov252, + CryptonightR_instruction_mov253, + CryptonightR_instruction_mov254, + CryptonightR_instruction_mov255, + CryptonightR_instruction_mov256, +}; diff --git a/src/crypto/asm/CryptonightR_template.inc b/src/crypto/asm/CryptonightR_template.inc new file mode 100644 index 00000000..b54486a5 --- /dev/null +++ b/src/crypto/asm/CryptonightR_template.inc @@ -0,0 +1,529 @@ +PUBLIC FN_PREFIX(CryptonightR_template_part1) +PUBLIC FN_PREFIX(CryptonightR_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_part2) +PUBLIC FN_PREFIX(CryptonightR_template_part3) +PUBLIC FN_PREFIX(CryptonightR_template_end) +PUBLIC FN_PREFIX(CryptonightR_template_double_part1) +PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_double_part2) +PUBLIC FN_PREFIX(CryptonightR_template_double_part3) +PUBLIC FN_PREFIX(CryptonightR_template_double_part4) +PUBLIC FN_PREFIX(CryptonightR_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movq xmm6, 
rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movq xmm0, r12 + movq xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movq xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movq xmm0, r15 + movq xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movq r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightR_template_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov rax, r13 + mul r12 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + movaps xmm3, xmm1 + movdqa xmm2, XMMWORD PTR [r9+r11] + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightR_template_mainloop) + +FN_PREFIX(CryptonightR_template_part3): + movq rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightR_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightR_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movq xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR 
[rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movq xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movq xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movq xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movq xmm0, rcx + mov r11d, 524288 + movq xmm10, rax + punpcklqdq xmm10, xmm0 + + movq xmm14, QWORD PTR [rsp+128] + movq xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movq xmm0, r12 + mov ecx, ebx + movq xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movq xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movq rdx, xmm6 + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movq xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movq rdi, xmm5 + movq rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movq xmm0, rsp + movq xmm1, rsi + movq xmm2, rdi + movq xmm11, rbp + movq xmm12, r15 + movq xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightR_template_double_part2): + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + + movq rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movq rsi, xmm1 + movq rdi, xmm2 + movq rbp, xmm11 + movq r15, xmm12 + movq rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 + 
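# [editorial note, not part of the original patch] Two magic numbers recur
# throughout these templates: 2097136 (0x1FFFF0 = 2 MiB - 16) masks a mixing
# value down to a 16-byte-aligned offset inside the 2 MiB CryptoNight
# scratchpad, and 524288 (2^19), loaded into the loop counter, is the
# CNv2/CN-R iteration count. A minimal C++ sketch of the address selection,
# with illustrative names only:
#
#   #include <cstdint>
#   constexpr uint32_t kScratchpadMask = 2 * 1024 * 1024 - 16; // 0x1FFFF0
#   inline uint8_t* scratchpad_entry(uint8_t* scratchpad, uint64_t mix) {
#       // same effect as the recurring "and r9d, 2097136" /
#       // "lea ..., [r9+r11]" pair in the loops above and below
#       return scratchpad + (mix & kScratchpadMask);
#   }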
xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + pxor xmm6, xmm2 + paddq xmm2, xmm3 + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movq rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movq xmm0, rsp + movq xmm1, rbx + movq xmm2, rsi + movq xmm11, rdi + movq xmm12, rbp + movq xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movq xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightR_template_double_part3): + + movq r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + + movq rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movq rbx, xmm1 + movq rsi, xmm2 + movq rdi, xmm11 + movq rbp, xmm12 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + mov rdi, rcx + mov r8, rax + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 + xor ebp, 48 + paddq xmm1, xmm8 + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movq rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightR_template_double_mainloop) + +FN_PREFIX(CryptonightR_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightR_template_double_end): diff --git a/src/crypto/asm/CryptonightWOW_soft_aes_template.inc b/src/crypto/asm/CryptonightWOW_soft_aes_template.inc new file mode 100644 index 00000000..feea3949 --- /dev/null +++ b/src/crypto/asm/CryptonightWOW_soft_aes_template.inc @@ -0,0 +1,266 @@ +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_soft_aes_template_part1): + mov QWORD PTR [rsp+8], rcx + push 
rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movq xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop): + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR 
[r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + +FN_PREFIX(CryptonightWOW_soft_aes_template_part2): + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop) + +FN_PREFIX(CryptonightWOW_soft_aes_template_part3): + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +FN_PREFIX(CryptonightWOW_soft_aes_template_end): diff --git a/src/crypto/asm/CryptonightWOW_template.inc b/src/crypto/asm/CryptonightWOW_template.inc new file mode 100644 index 00000000..7183a659 --- /dev/null +++ b/src/crypto/asm/CryptonightWOW_template.inc @@ -0,0 +1,486 @@ +PUBLIC FN_PREFIX(CryptonightWOW_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_end) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR 
[rdx+88] + xor r12, QWORD PTR [rdx+72] + movq xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movq xmm0, r12 + movq xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movq xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movq xmm0, r15 + movq xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movq r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightWOW_template_part2): + mov rax, r13 + mul r12 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightWOW_template_mainloop) + +FN_PREFIX(CryptonightWOW_template_part3): + movq rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightWOW_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movq xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], 
xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movq xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movq xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movq xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movq xmm0, rcx + mov r11d, 524288 + movq xmm10, rax + punpcklqdq xmm10, xmm0 + + movq xmm14, QWORD PTR [rsp+128] + movq xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movq xmm0, r12 + mov ecx, ebx + movq xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movq rdx, xmm6 + movq xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movq xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movq rdi, xmm5 + movq rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movq xmm0, rsp + movq xmm1, rsi + movq xmm2, rdi + movq xmm11, rbp + movq xmm12, r15 + movq xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightWOW_template_double_part2): + + movq rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movq rsi, xmm1 + movq rdi, xmm2 + movq rbp, xmm11 + movq r15, xmm12 + movq rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movq xmm1, rdx + movq xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + 
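# [editorial note, not part of the original patch] None of these template
# functions is executed as-is. The part1/mainloop/part2/part3/part4/end
# labels exported above delimit machine-code spans that a runtime generator
# (CryptoNightR_gen.cpp, added by this patch) presumably copies into an
# executable buffer, splicing the per-block random-math program in at the
# part2 boundary from the CryptonightR_instruction* one-instruction stubs
# declared later in the patch. A hedged C++ sketch of the copying idiom;
# the helper name and usage are illustrative, not lifted from the patch:
#
#   #include <cstring>
#   #include <cstdint>
#   typedef void (*void_func)();
#   static void add_code(uint8_t*& p, void_func start, void_func end) {
#       // copy the template bytes lying between two exported labels
#       const ptrdiff_t size = reinterpret_cast<const uint8_t*>(end)
#                            - reinterpret_cast<const uint8_t*>(start);
#       memcpy(p, reinterpret_cast<const void*>(start), size);
#       p += size;
#   }
#   // add_code(p, CryptonightWOW_template_part1, CryptonightWOW_template_part2);
#   // ... emit the generated random-math instructions here ...
#   // add_code(p, CryptonightWOW_template_part2, CryptonightWOW_template_part3);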
xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movq rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movq xmm0, rsp + movq xmm1, rbx + movq xmm2, rsi + movq xmm11, rdi + movq xmm12, rbp + movq xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movq xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightWOW_template_double_part3): + + movq rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movq rbx, xmm1 + movq rsi, xmm2 + movq rdi, xmm11 + movq rbp, xmm12 + movq r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movq xmm1, rdx + movq xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movq rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightWOW_template_double_mainloop) + +FN_PREFIX(CryptonightWOW_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightWOW_template_double_end): diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 5dc80bea..26f353a1 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -28,6 +28,19 @@ .global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm) .global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_xcash_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_zelerius_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_rwz_all_asm) +.global FN_PREFIX(cnv2_double_main_loop_rwz_all_asm) + .global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm) .global 
FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm) @@ -37,6 +50,8 @@ .global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_soft_aes_sandybridge_asm) #ifdef __APPLE__ ALIGN 16 @@ -245,6 +260,129 @@ FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm): add rsp, 48 ret 0 +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_xcash_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_xcash_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_xcash_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_xcash_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_xcash_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_xcash_bulldozer.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_main_loop_xcash_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cnv2_double_main_loop_xcash_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_zelerius_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_zelerius_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_zelerius_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_zelerius_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_zelerius_bulldozer.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cnv2_double_main_loop_zelerius_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_rwz_all_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_rwz_all.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_main_loop_rwz_all_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cnv2_double_main_loop_rwz_all.inc" + add rsp, 48 + ret 0 + #ifdef __APPLE__ ALIGN 16 #else @@ -340,4 +478,29 @@ FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm): mov rcx, rdi #include "cnv2_main_loop_ultralite_soft_aes_sandybridge.inc" add rsp, 48 + ret 0 + + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_xcash_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_xcash_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_zelerius_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_zelerius_soft_aes_sandybridge.inc" + add rsp, 48 ret 0 \ No newline at end of file diff --git a/src/crypto/asm/cnv2_double_main_loop_rwz_all.inc b/src/crypto/asm/cnv2_double_main_loop_rwz_all.inc new file mode 100644 index 00000000..d2d87173 --- /dev/null +++ b/src/crypto/asm/cnv2_double_main_loop_rwz_all.inc @@ -0,0 +1,410 
@@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 393216 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN(64) +rwz_main_loop_double: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rax+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, 
QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm3 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r8+r13], xmm0 + xor r8d, 32 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r11+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js rwz_div_fix_1 +rwz_div_fix_1_ret: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js rwz_div_fix_2 +rwz_div_fix_2_ret: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je rwz_sqrt_fix_1 +rwz_sqrt_fix_1_ret: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je rwz_sqrt_fix_2 +rwz_sqrt_fix_2_ret: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm3, xmm6 + paddq xmm1, xmm11 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm3 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm0 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne rwz_main_loop_double + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] 
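# [editorial note, not part of the original patch] This include implements
# the "rwz" (Graft; config variant "rwz" or "graft") main loop: the iteration
# counter is initialised to 393216 instead of the stock CNv2 524288, i.e.
# three quarters of the usual count, and the shuffle stores run in a
# reversed order relative to plain CNv2, which is what the rwz_* label
# prefix marks. A one-line sketch of the constant relationship:
#
#   constexpr uint32_t kCnV2Iterations = 524288;                  // 2^19
#   constexpr uint32_t kRwzIterations  = kCnV2Iterations * 3 / 4; // 393216,
#   // the value loaded into r14d above and into esi in the
#   // single-hash loop that follows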
+ movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp rwz_cnv2_double_mainloop_asm_endp + +rwz_div_fix_1: + dec rbx + add r11, rdx + jmp rwz_div_fix_1_ret + +rwz_div_fix_2: + dec rdx + add r8, r9 + jmp rwz_div_fix_2_ret + +rwz_sqrt_fix_1: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_1_ret + +rwz_sqrt_fix_2: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_2_ret + +rwz_cnv2_double_mainloop_asm_endp: diff --git a/src/crypto/asm/cnv2_main_loop_rwz_all.inc b/src/crypto/asm/cnv2_main_loop_rwz_all.inc new file mode 100644 index 00000000..021f787e --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_rwz_all.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 393216 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN(64) +rwz_main_loop: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm0, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm2, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + paddq xmm1, xmm7 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 
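# [editorial note, not part of the original patch] The div above and the
# sqrtsd below are CNv2's integer-math injection: a 64-bit division result
# feeds a square root computed in double precision, and the rwz_sqrt_fixup
# slow path afterwards nudges the result when rounding lands on the wrong
# side of an integer boundary. The fixup idiom, as a generic hedged C++
# sketch (deliberately not the exact CNv2 bit-level formula, which works on
# shifted IEEE-754 bit patterns):
#
#   #include <cmath>
#   #include <cstdint>
#   inline uint64_t isqrt_with_fixup(uint64_t n) {
#       // fast estimate, possibly off by one because of double rounding
#       uint64_t r = static_cast<uint64_t>(std::sqrt(static_cast<double>(n))) + 1;
#       while (r * r > n) --r;  // the +/-1 correction the fixup block applies
#       return r;               // (assumes r*r does not overflow for the inputs used)
#   }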
+ add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je rwz_sqrt_fixup + psrlq xmm3, 19 +rwz_sqrt_fixup_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm4 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm5 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm2 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne rwz_main_loop + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_rwz_main_loop_endp + +rwz_sqrt_fixup: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp rwz_sqrt_fixup_ret + +cnv2_rwz_main_loop_endp: diff --git a/src/crypto/asm/win/CryptonightR_soft_aes_template.inc b/src/crypto/asm/win/CryptonightR_soft_aes_template.inc new file mode 100644 index 00000000..c4a0559b --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_soft_aes_template.inc @@ -0,0 +1,279 @@ +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part1) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part2) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_part3) +PUBLIC FN_PREFIX(CryptonightR_soft_aes_template_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_part1): + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movd xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movd xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movd xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movd xmm10, QWORD PTR [r10+96] + movd xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movd xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movd xmm13, rcx + mov r12d, 524288 + + 
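# [editorial note, not part of the original patch] The mainloop below
# performs an AES round without AES-NI: each state byte indexes precomputed
# 1 KiB lookup tables (the DWORD PTR [r12+rcx*4] and [r12+rcx*4+1024] loads,
# with "add r12, 2048" stepping to the next table pair) and the four lookups
# per output column are XOR-combined. A hedged C++ sketch of the classic
# T-table combine this mirrors; table contents are assumed precomputed
# elsewhere:
#
#   #include <cstdint>
#   extern const uint32_t T0[256], T1[256], T2[256], T3[256];
#   inline uint32_t aes_column(uint32_t c0, uint32_t c1,
#                              uint32_t c2, uint32_t c3) {
#       // taking bytes diagonally across the four input columns is ShiftRows
#       return T0[ c0        & 0xff] ^ T1[(c1 >>  8) & 0xff]
#            ^ T2[(c2 >> 16) & 0xff] ^ T3[(c3 >> 24) & 0xff];
#   }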
ALIGN(64) +FN_PREFIX(CryptonightR_soft_aes_template_mainloop): + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movd xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movd xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movd r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + pxor xmm6, xmm1 + pxor xmm6, xmm0 + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movd rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movd rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + movaps xmm0, xmm5 + psrldq xmm0, 8 + movd r9d, xmm0 + +FN_PREFIX(CryptonightR_soft_aes_template_part2): + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov edi, edi + shl rbp, 32 + or rbp, rdi + xor r8, rbp + + mov ebx, ebx + shl rsi, 32 + or rsi, rbx + xor QWORD PTR [rsp+320], rsi + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm1 + paddq xmm1, xmm7 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm6, xmm0 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR 
[rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne FN_PREFIX(CryptonightR_soft_aes_template_mainloop) + +FN_PREFIX(CryptonightR_soft_aes_template_part3): + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +FN_PREFIX(CryptonightR_soft_aes_template_end): diff --git a/src/crypto/asm/win/CryptonightR_soft_aes_template_win.inc b/src/crypto/asm/win/CryptonightR_soft_aes_template_win.inc new file mode 100644 index 00000000..d6d393a9 --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_soft_aes_template_win.inc @@ -0,0 +1,279 @@ +PUBLIC CryptonightR_soft_aes_template_part1 +PUBLIC CryptonightR_soft_aes_template_mainloop +PUBLIC CryptonightR_soft_aes_template_part2 +PUBLIC CryptonightR_soft_aes_template_part3 +PUBLIC CryptonightR_soft_aes_template_end + +ALIGN(64) +CryptonightR_soft_aes_template_part1: + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movd xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movd xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movd xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movd xmm10, QWORD PTR [r10+96] + movd xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movd xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movd xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +CryptonightR_soft_aes_template_mainloop: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movd xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movd xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr 
ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movd r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + pxor xmm6, xmm1 + pxor xmm6, xmm0 + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movd rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movd rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + movaps xmm0, xmm5 + psrldq xmm0, 8 + movd r9d, xmm0 + +CryptonightR_soft_aes_template_part2: + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov edi, edi + shl rbp, 32 + or rbp, rdi + xor r8, rbp + + mov ebx, ebx + shl rsi, 32 + or rsi, rbx + xor QWORD PTR [rsp+320], rsi + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm2 + pxor xmm6, xmm1 + paddq xmm1, xmm7 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm6, xmm0 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne CryptonightR_soft_aes_template_mainloop + +CryptonightR_soft_aes_template_part3: + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +CryptonightR_soft_aes_template_end: diff --git 
a/src/crypto/asm/win/CryptonightR_template.S b/src/crypto/asm/win/CryptonightR_template.S new file mode 100644 index 00000000..d2974d16 --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_template.S @@ -0,0 +1,1595 @@ +#ifdef __APPLE__ +# define ALIGN(x) .align 6 +#else +# define ALIGN(x) .align 64 +#endif +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif + +#define PUBLIC .global + +PUBLIC FN_PREFIX(CryptonightR_instruction0) +PUBLIC FN_PREFIX(CryptonightR_instruction1) +PUBLIC FN_PREFIX(CryptonightR_instruction2) +PUBLIC FN_PREFIX(CryptonightR_instruction3) +PUBLIC FN_PREFIX(CryptonightR_instruction4) +PUBLIC FN_PREFIX(CryptonightR_instruction5) +PUBLIC FN_PREFIX(CryptonightR_instruction6) +PUBLIC FN_PREFIX(CryptonightR_instruction7) +PUBLIC FN_PREFIX(CryptonightR_instruction8) +PUBLIC FN_PREFIX(CryptonightR_instruction9) +PUBLIC FN_PREFIX(CryptonightR_instruction10) +PUBLIC FN_PREFIX(CryptonightR_instruction11) +PUBLIC FN_PREFIX(CryptonightR_instruction12) +PUBLIC FN_PREFIX(CryptonightR_instruction13) +PUBLIC FN_PREFIX(CryptonightR_instruction14) +PUBLIC FN_PREFIX(CryptonightR_instruction15) +PUBLIC FN_PREFIX(CryptonightR_instruction16) +PUBLIC FN_PREFIX(CryptonightR_instruction17) +PUBLIC FN_PREFIX(CryptonightR_instruction18) +PUBLIC FN_PREFIX(CryptonightR_instruction19) +PUBLIC FN_PREFIX(CryptonightR_instruction20) +PUBLIC FN_PREFIX(CryptonightR_instruction21) +PUBLIC FN_PREFIX(CryptonightR_instruction22) +PUBLIC FN_PREFIX(CryptonightR_instruction23) +PUBLIC FN_PREFIX(CryptonightR_instruction24) +PUBLIC FN_PREFIX(CryptonightR_instruction25) +PUBLIC FN_PREFIX(CryptonightR_instruction26) +PUBLIC FN_PREFIX(CryptonightR_instruction27) +PUBLIC FN_PREFIX(CryptonightR_instruction28) +PUBLIC FN_PREFIX(CryptonightR_instruction29) +PUBLIC FN_PREFIX(CryptonightR_instruction30) +PUBLIC FN_PREFIX(CryptonightR_instruction31) +PUBLIC FN_PREFIX(CryptonightR_instruction32) +PUBLIC FN_PREFIX(CryptonightR_instruction33) +PUBLIC FN_PREFIX(CryptonightR_instruction34) +PUBLIC FN_PREFIX(CryptonightR_instruction35) +PUBLIC FN_PREFIX(CryptonightR_instruction36) +PUBLIC FN_PREFIX(CryptonightR_instruction37) +PUBLIC FN_PREFIX(CryptonightR_instruction38) +PUBLIC FN_PREFIX(CryptonightR_instruction39) +PUBLIC FN_PREFIX(CryptonightR_instruction40) +PUBLIC FN_PREFIX(CryptonightR_instruction41) +PUBLIC FN_PREFIX(CryptonightR_instruction42) +PUBLIC FN_PREFIX(CryptonightR_instruction43) +PUBLIC FN_PREFIX(CryptonightR_instruction44) +PUBLIC FN_PREFIX(CryptonightR_instruction45) +PUBLIC FN_PREFIX(CryptonightR_instruction46) +PUBLIC FN_PREFIX(CryptonightR_instruction47) +PUBLIC FN_PREFIX(CryptonightR_instruction48) +PUBLIC FN_PREFIX(CryptonightR_instruction49) +PUBLIC FN_PREFIX(CryptonightR_instruction50) +PUBLIC FN_PREFIX(CryptonightR_instruction51) +PUBLIC FN_PREFIX(CryptonightR_instruction52) +PUBLIC FN_PREFIX(CryptonightR_instruction53) +PUBLIC FN_PREFIX(CryptonightR_instruction54) +PUBLIC FN_PREFIX(CryptonightR_instruction55) +PUBLIC FN_PREFIX(CryptonightR_instruction56) +PUBLIC FN_PREFIX(CryptonightR_instruction57) +PUBLIC FN_PREFIX(CryptonightR_instruction58) +PUBLIC FN_PREFIX(CryptonightR_instruction59) +PUBLIC FN_PREFIX(CryptonightR_instruction60) +PUBLIC FN_PREFIX(CryptonightR_instruction61) +PUBLIC FN_PREFIX(CryptonightR_instruction62) +PUBLIC FN_PREFIX(CryptonightR_instruction63) +PUBLIC FN_PREFIX(CryptonightR_instruction64) +PUBLIC FN_PREFIX(CryptonightR_instruction65) +PUBLIC FN_PREFIX(CryptonightR_instruction66) +PUBLIC 
FN_PREFIX(CryptonightR_instruction67) +PUBLIC FN_PREFIX(CryptonightR_instruction68) +PUBLIC FN_PREFIX(CryptonightR_instruction69) +PUBLIC FN_PREFIX(CryptonightR_instruction70) +PUBLIC FN_PREFIX(CryptonightR_instruction71) +PUBLIC FN_PREFIX(CryptonightR_instruction72) +PUBLIC FN_PREFIX(CryptonightR_instruction73) +PUBLIC FN_PREFIX(CryptonightR_instruction74) +PUBLIC FN_PREFIX(CryptonightR_instruction75) +PUBLIC FN_PREFIX(CryptonightR_instruction76) +PUBLIC FN_PREFIX(CryptonightR_instruction77) +PUBLIC FN_PREFIX(CryptonightR_instruction78) +PUBLIC FN_PREFIX(CryptonightR_instruction79) +PUBLIC FN_PREFIX(CryptonightR_instruction80) +PUBLIC FN_PREFIX(CryptonightR_instruction81) +PUBLIC FN_PREFIX(CryptonightR_instruction82) +PUBLIC FN_PREFIX(CryptonightR_instruction83) +PUBLIC FN_PREFIX(CryptonightR_instruction84) +PUBLIC FN_PREFIX(CryptonightR_instruction85) +PUBLIC FN_PREFIX(CryptonightR_instruction86) +PUBLIC FN_PREFIX(CryptonightR_instruction87) +PUBLIC FN_PREFIX(CryptonightR_instruction88) +PUBLIC FN_PREFIX(CryptonightR_instruction89) +PUBLIC FN_PREFIX(CryptonightR_instruction90) +PUBLIC FN_PREFIX(CryptonightR_instruction91) +PUBLIC FN_PREFIX(CryptonightR_instruction92) +PUBLIC FN_PREFIX(CryptonightR_instruction93) +PUBLIC FN_PREFIX(CryptonightR_instruction94) +PUBLIC FN_PREFIX(CryptonightR_instruction95) +PUBLIC FN_PREFIX(CryptonightR_instruction96) +PUBLIC FN_PREFIX(CryptonightR_instruction97) +PUBLIC FN_PREFIX(CryptonightR_instruction98) +PUBLIC FN_PREFIX(CryptonightR_instruction99) +PUBLIC FN_PREFIX(CryptonightR_instruction100) +PUBLIC FN_PREFIX(CryptonightR_instruction101) +PUBLIC FN_PREFIX(CryptonightR_instruction102) +PUBLIC FN_PREFIX(CryptonightR_instruction103) +PUBLIC FN_PREFIX(CryptonightR_instruction104) +PUBLIC FN_PREFIX(CryptonightR_instruction105) +PUBLIC FN_PREFIX(CryptonightR_instruction106) +PUBLIC FN_PREFIX(CryptonightR_instruction107) +PUBLIC FN_PREFIX(CryptonightR_instruction108) +PUBLIC FN_PREFIX(CryptonightR_instruction109) +PUBLIC FN_PREFIX(CryptonightR_instruction110) +PUBLIC FN_PREFIX(CryptonightR_instruction111) +PUBLIC FN_PREFIX(CryptonightR_instruction112) +PUBLIC FN_PREFIX(CryptonightR_instruction113) +PUBLIC FN_PREFIX(CryptonightR_instruction114) +PUBLIC FN_PREFIX(CryptonightR_instruction115) +PUBLIC FN_PREFIX(CryptonightR_instruction116) +PUBLIC FN_PREFIX(CryptonightR_instruction117) +PUBLIC FN_PREFIX(CryptonightR_instruction118) +PUBLIC FN_PREFIX(CryptonightR_instruction119) +PUBLIC FN_PREFIX(CryptonightR_instruction120) +PUBLIC FN_PREFIX(CryptonightR_instruction121) +PUBLIC FN_PREFIX(CryptonightR_instruction122) +PUBLIC FN_PREFIX(CryptonightR_instruction123) +PUBLIC FN_PREFIX(CryptonightR_instruction124) +PUBLIC FN_PREFIX(CryptonightR_instruction125) +PUBLIC FN_PREFIX(CryptonightR_instruction126) +PUBLIC FN_PREFIX(CryptonightR_instruction127) +PUBLIC FN_PREFIX(CryptonightR_instruction128) +PUBLIC FN_PREFIX(CryptonightR_instruction129) +PUBLIC FN_PREFIX(CryptonightR_instruction130) +PUBLIC FN_PREFIX(CryptonightR_instruction131) +PUBLIC FN_PREFIX(CryptonightR_instruction132) +PUBLIC FN_PREFIX(CryptonightR_instruction133) +PUBLIC FN_PREFIX(CryptonightR_instruction134) +PUBLIC FN_PREFIX(CryptonightR_instruction135) +PUBLIC FN_PREFIX(CryptonightR_instruction136) +PUBLIC FN_PREFIX(CryptonightR_instruction137) +PUBLIC FN_PREFIX(CryptonightR_instruction138) +PUBLIC FN_PREFIX(CryptonightR_instruction139) +PUBLIC FN_PREFIX(CryptonightR_instruction140) +PUBLIC FN_PREFIX(CryptonightR_instruction141) +PUBLIC FN_PREFIX(CryptonightR_instruction142) +PUBLIC 
FN_PREFIX(CryptonightR_instruction143) +PUBLIC FN_PREFIX(CryptonightR_instruction144) +PUBLIC FN_PREFIX(CryptonightR_instruction145) +PUBLIC FN_PREFIX(CryptonightR_instruction146) +PUBLIC FN_PREFIX(CryptonightR_instruction147) +PUBLIC FN_PREFIX(CryptonightR_instruction148) +PUBLIC FN_PREFIX(CryptonightR_instruction149) +PUBLIC FN_PREFIX(CryptonightR_instruction150) +PUBLIC FN_PREFIX(CryptonightR_instruction151) +PUBLIC FN_PREFIX(CryptonightR_instruction152) +PUBLIC FN_PREFIX(CryptonightR_instruction153) +PUBLIC FN_PREFIX(CryptonightR_instruction154) +PUBLIC FN_PREFIX(CryptonightR_instruction155) +PUBLIC FN_PREFIX(CryptonightR_instruction156) +PUBLIC FN_PREFIX(CryptonightR_instruction157) +PUBLIC FN_PREFIX(CryptonightR_instruction158) +PUBLIC FN_PREFIX(CryptonightR_instruction159) +PUBLIC FN_PREFIX(CryptonightR_instruction160) +PUBLIC FN_PREFIX(CryptonightR_instruction161) +PUBLIC FN_PREFIX(CryptonightR_instruction162) +PUBLIC FN_PREFIX(CryptonightR_instruction163) +PUBLIC FN_PREFIX(CryptonightR_instruction164) +PUBLIC FN_PREFIX(CryptonightR_instruction165) +PUBLIC FN_PREFIX(CryptonightR_instruction166) +PUBLIC FN_PREFIX(CryptonightR_instruction167) +PUBLIC FN_PREFIX(CryptonightR_instruction168) +PUBLIC FN_PREFIX(CryptonightR_instruction169) +PUBLIC FN_PREFIX(CryptonightR_instruction170) +PUBLIC FN_PREFIX(CryptonightR_instruction171) +PUBLIC FN_PREFIX(CryptonightR_instruction172) +PUBLIC FN_PREFIX(CryptonightR_instruction173) +PUBLIC FN_PREFIX(CryptonightR_instruction174) +PUBLIC FN_PREFIX(CryptonightR_instruction175) +PUBLIC FN_PREFIX(CryptonightR_instruction176) +PUBLIC FN_PREFIX(CryptonightR_instruction177) +PUBLIC FN_PREFIX(CryptonightR_instruction178) +PUBLIC FN_PREFIX(CryptonightR_instruction179) +PUBLIC FN_PREFIX(CryptonightR_instruction180) +PUBLIC FN_PREFIX(CryptonightR_instruction181) +PUBLIC FN_PREFIX(CryptonightR_instruction182) +PUBLIC FN_PREFIX(CryptonightR_instruction183) +PUBLIC FN_PREFIX(CryptonightR_instruction184) +PUBLIC FN_PREFIX(CryptonightR_instruction185) +PUBLIC FN_PREFIX(CryptonightR_instruction186) +PUBLIC FN_PREFIX(CryptonightR_instruction187) +PUBLIC FN_PREFIX(CryptonightR_instruction188) +PUBLIC FN_PREFIX(CryptonightR_instruction189) +PUBLIC FN_PREFIX(CryptonightR_instruction190) +PUBLIC FN_PREFIX(CryptonightR_instruction191) +PUBLIC FN_PREFIX(CryptonightR_instruction192) +PUBLIC FN_PREFIX(CryptonightR_instruction193) +PUBLIC FN_PREFIX(CryptonightR_instruction194) +PUBLIC FN_PREFIX(CryptonightR_instruction195) +PUBLIC FN_PREFIX(CryptonightR_instruction196) +PUBLIC FN_PREFIX(CryptonightR_instruction197) +PUBLIC FN_PREFIX(CryptonightR_instruction198) +PUBLIC FN_PREFIX(CryptonightR_instruction199) +PUBLIC FN_PREFIX(CryptonightR_instruction200) +PUBLIC FN_PREFIX(CryptonightR_instruction201) +PUBLIC FN_PREFIX(CryptonightR_instruction202) +PUBLIC FN_PREFIX(CryptonightR_instruction203) +PUBLIC FN_PREFIX(CryptonightR_instruction204) +PUBLIC FN_PREFIX(CryptonightR_instruction205) +PUBLIC FN_PREFIX(CryptonightR_instruction206) +PUBLIC FN_PREFIX(CryptonightR_instruction207) +PUBLIC FN_PREFIX(CryptonightR_instruction208) +PUBLIC FN_PREFIX(CryptonightR_instruction209) +PUBLIC FN_PREFIX(CryptonightR_instruction210) +PUBLIC FN_PREFIX(CryptonightR_instruction211) +PUBLIC FN_PREFIX(CryptonightR_instruction212) +PUBLIC FN_PREFIX(CryptonightR_instruction213) +PUBLIC FN_PREFIX(CryptonightR_instruction214) +PUBLIC FN_PREFIX(CryptonightR_instruction215) +PUBLIC FN_PREFIX(CryptonightR_instruction216) +PUBLIC FN_PREFIX(CryptonightR_instruction217) +PUBLIC 
FN_PREFIX(CryptonightR_instruction218) +PUBLIC FN_PREFIX(CryptonightR_instruction219) +PUBLIC FN_PREFIX(CryptonightR_instruction220) +PUBLIC FN_PREFIX(CryptonightR_instruction221) +PUBLIC FN_PREFIX(CryptonightR_instruction222) +PUBLIC FN_PREFIX(CryptonightR_instruction223) +PUBLIC FN_PREFIX(CryptonightR_instruction224) +PUBLIC FN_PREFIX(CryptonightR_instruction225) +PUBLIC FN_PREFIX(CryptonightR_instruction226) +PUBLIC FN_PREFIX(CryptonightR_instruction227) +PUBLIC FN_PREFIX(CryptonightR_instruction228) +PUBLIC FN_PREFIX(CryptonightR_instruction229) +PUBLIC FN_PREFIX(CryptonightR_instruction230) +PUBLIC FN_PREFIX(CryptonightR_instruction231) +PUBLIC FN_PREFIX(CryptonightR_instruction232) +PUBLIC FN_PREFIX(CryptonightR_instruction233) +PUBLIC FN_PREFIX(CryptonightR_instruction234) +PUBLIC FN_PREFIX(CryptonightR_instruction235) +PUBLIC FN_PREFIX(CryptonightR_instruction236) +PUBLIC FN_PREFIX(CryptonightR_instruction237) +PUBLIC FN_PREFIX(CryptonightR_instruction238) +PUBLIC FN_PREFIX(CryptonightR_instruction239) +PUBLIC FN_PREFIX(CryptonightR_instruction240) +PUBLIC FN_PREFIX(CryptonightR_instruction241) +PUBLIC FN_PREFIX(CryptonightR_instruction242) +PUBLIC FN_PREFIX(CryptonightR_instruction243) +PUBLIC FN_PREFIX(CryptonightR_instruction244) +PUBLIC FN_PREFIX(CryptonightR_instruction245) +PUBLIC FN_PREFIX(CryptonightR_instruction246) +PUBLIC FN_PREFIX(CryptonightR_instruction247) +PUBLIC FN_PREFIX(CryptonightR_instruction248) +PUBLIC FN_PREFIX(CryptonightR_instruction249) +PUBLIC FN_PREFIX(CryptonightR_instruction250) +PUBLIC FN_PREFIX(CryptonightR_instruction251) +PUBLIC FN_PREFIX(CryptonightR_instruction252) +PUBLIC FN_PREFIX(CryptonightR_instruction253) +PUBLIC FN_PREFIX(CryptonightR_instruction254) +PUBLIC FN_PREFIX(CryptonightR_instruction255) +PUBLIC FN_PREFIX(CryptonightR_instruction256) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov0) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov1) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov2) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov3) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov4) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov5) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov6) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov7) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov8) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov9) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov10) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov11) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov12) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov13) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov14) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov15) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov16) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov17) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov18) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov19) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov20) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov21) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov22) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov23) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov24) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov25) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov26) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov27) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov28) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov29) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov30) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov31) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov32) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov33) +PUBLIC 
FN_PREFIX(CryptonightR_instruction_mov34) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov35) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov36) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov37) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov38) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov39) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov40) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov41) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov42) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov43) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov44) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov45) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov46) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov47) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov48) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov49) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov50) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov51) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov52) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov53) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov54) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov55) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov56) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov57) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov58) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov59) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov60) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov61) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov62) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov63) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov64) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov65) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov66) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov67) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov68) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov69) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov70) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov71) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov72) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov73) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov74) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov75) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov76) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov77) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov78) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov79) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov80) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov81) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov82) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov83) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov84) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov85) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov86) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov87) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov88) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov89) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov90) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov91) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov92) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov93) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov94) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov95) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov96) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov97) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov98) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov99) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov100) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov101) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov102) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov103) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov104) +PUBLIC 
FN_PREFIX(CryptonightR_instruction_mov105) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov106) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov107) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov108) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov109) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov110) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov111) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov112) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov113) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov114) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov115) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov116) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov117) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov118) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov119) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov120) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov121) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov122) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov123) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov124) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov125) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov126) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov127) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov128) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov129) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov130) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov131) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov132) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov133) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov134) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov135) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov136) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov137) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov138) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov139) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov140) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov141) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov142) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov143) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov144) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov145) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov146) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov147) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov148) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov149) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov150) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov151) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov152) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov153) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov154) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov155) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov156) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov157) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov158) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov159) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov160) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov161) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov162) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov163) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov164) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov165) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov166) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov167) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov168) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov169) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov170) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov171) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov172) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov173) +PUBLIC 
FN_PREFIX(CryptonightR_instruction_mov174) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov175) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov176) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov177) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov178) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov179) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov180) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov181) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov182) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov183) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov184) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov185) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov186) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov187) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov188) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov189) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov190) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov191) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov192) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov193) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov194) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov195) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov196) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov197) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov198) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov199) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov200) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov201) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov202) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov203) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov204) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov205) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov206) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov207) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov208) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov209) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov210) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov211) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov212) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov213) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov214) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov215) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov216) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov217) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov218) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov219) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov220) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov221) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov222) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov223) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov224) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov225) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov226) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov227) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov228) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov229) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov230) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov231) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov232) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov233) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov234) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov235) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov236) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov237) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov238) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov239) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov240) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov241) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov242) +PUBLIC 
FN_PREFIX(CryptonightR_instruction_mov243) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov244) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov245) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov246) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov247) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov248) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov249) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov250) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov251) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov252) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov253) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov254) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov255) +PUBLIC FN_PREFIX(CryptonightR_instruction_mov256) + +#include "CryptonightWOW_template.inc" +#include "CryptonightR_template.inc" +#include "CryptonightWOW_soft_aes_template.inc" +#include "CryptonightR_soft_aes_template.inc" + +FN_PREFIX(CryptonightR_instruction0): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction1): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction2): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction3): + add rbx, r9 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction4): + sub rbx, r9 +FN_PREFIX(CryptonightR_instruction5): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction6): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction7): + xor rbx, r9 +FN_PREFIX(CryptonightR_instruction8): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction9): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction10): + imul rsi, rbx +FN_PREFIX(CryptonightR_instruction11): + add rsi, rbx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction12): + sub rsi, rbx +FN_PREFIX(CryptonightR_instruction13): + ror esi, cl +FN_PREFIX(CryptonightR_instruction14): + rol esi, cl +FN_PREFIX(CryptonightR_instruction15): + xor rsi, rbx +FN_PREFIX(CryptonightR_instruction16): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction17): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction18): + imul rdi, rbx +FN_PREFIX(CryptonightR_instruction19): + add rdi, rbx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction20): + sub rdi, rbx +FN_PREFIX(CryptonightR_instruction21): + ror edi, cl +FN_PREFIX(CryptonightR_instruction22): + rol edi, cl +FN_PREFIX(CryptonightR_instruction23): + xor rdi, rbx +FN_PREFIX(CryptonightR_instruction24): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction25): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction26): + imul rbp, rbx +FN_PREFIX(CryptonightR_instruction27): + add rbp, rbx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction28): + sub rbp, rbx +FN_PREFIX(CryptonightR_instruction29): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction30): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction31): + xor rbp, rbx +FN_PREFIX(CryptonightR_instruction32): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction33): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction34): + imul rbx, rsi +FN_PREFIX(CryptonightR_instruction35): + add rbx, rsi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction36): + sub rbx, rsi +FN_PREFIX(CryptonightR_instruction37): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction38): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction39): + xor rbx, rsi +FN_PREFIX(CryptonightR_instruction40): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction41): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction42): + imul rsi, rsi +FN_PREFIX(CryptonightR_instruction43): + add rsi, r9 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction44): + sub rsi, r9 +FN_PREFIX(CryptonightR_instruction45): + ror esi, cl 
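+/* Layout of this instruction table: each CryptonightR_instruction<N> label
+   marks one candidate instruction for the CN-R (CNv4) random-math sequence.
+   The destination register advances every eight slots (rbx, rsi, rdi, rbp),
+   and within each group of eight the opcodes run IMUL, IMUL, IMUL, ADD,
+   SUB, ROR, ROL, XOR; IMUL fills three slots, matching MUL's deliberately
+   higher frequency in the CN-R instruction mix. CryptoNightR_gen.cpp treats
+   the distance between consecutive labels as the slot's size and memcpys it
+   into the JIT buffer, so only non-emitting directives and comments may sit
+   between labels. */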
+FN_PREFIX(CryptonightR_instruction46): + rol esi, cl +FN_PREFIX(CryptonightR_instruction47): + xor rsi, r9 +FN_PREFIX(CryptonightR_instruction48): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction49): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction50): + imul rdi, rsi +FN_PREFIX(CryptonightR_instruction51): + add rdi, rsi + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction52): + sub rdi, rsi +FN_PREFIX(CryptonightR_instruction53): + ror edi, cl +FN_PREFIX(CryptonightR_instruction54): + rol edi, cl +FN_PREFIX(CryptonightR_instruction55): + xor rdi, rsi +FN_PREFIX(CryptonightR_instruction56): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction57): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction58): + imul rbp, rsi +FN_PREFIX(CryptonightR_instruction59): + add rbp, rsi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction60): + sub rbp, rsi +FN_PREFIX(CryptonightR_instruction61): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction62): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction63): + xor rbp, rsi +FN_PREFIX(CryptonightR_instruction64): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction65): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction66): + imul rbx, rdi +FN_PREFIX(CryptonightR_instruction67): + add rbx, rdi + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction68): + sub rbx, rdi +FN_PREFIX(CryptonightR_instruction69): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction70): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction71): + xor rbx, rdi +FN_PREFIX(CryptonightR_instruction72): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction73): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction74): + imul rsi, rdi +FN_PREFIX(CryptonightR_instruction75): + add rsi, rdi + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction76): + sub rsi, rdi +FN_PREFIX(CryptonightR_instruction77): + ror esi, cl +FN_PREFIX(CryptonightR_instruction78): + rol esi, cl +FN_PREFIX(CryptonightR_instruction79): + xor rsi, rdi +FN_PREFIX(CryptonightR_instruction80): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction81): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction82): + imul rdi, rdi +FN_PREFIX(CryptonightR_instruction83): + add rdi, r9 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction84): + sub rdi, r9 +FN_PREFIX(CryptonightR_instruction85): + ror edi, cl +FN_PREFIX(CryptonightR_instruction86): + rol edi, cl +FN_PREFIX(CryptonightR_instruction87): + xor rdi, r9 +FN_PREFIX(CryptonightR_instruction88): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction89): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction90): + imul rbp, rdi +FN_PREFIX(CryptonightR_instruction91): + add rbp, rdi + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction92): + sub rbp, rdi +FN_PREFIX(CryptonightR_instruction93): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction94): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction95): + xor rbp, rdi +FN_PREFIX(CryptonightR_instruction96): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction97): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction98): + imul rbx, rbp +FN_PREFIX(CryptonightR_instruction99): + add rbx, rbp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction100): + sub rbx, rbp +FN_PREFIX(CryptonightR_instruction101): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction102): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction103): + xor rbx, rbp +FN_PREFIX(CryptonightR_instruction104): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction105): + imul rsi, rbp +FN_PREFIX(CryptonightR_instruction106): + imul rsi, rbp 
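+/* The second half of every ADD slot, "add <dst>, 2147483647", is a
+   placeholder: CN-R's ADD also adds a random 32-bit constant, and the
+   generator patches the 0x7FFFFFFF immediate after copying the slot. In
+   slots whose encoded source equals the destination, ADD/SUB/XOR read r9
+   instead; this pattern suggests those encodings are reused for the ninth
+   virtual register (R8, apparently kept in r9), since the generator never
+   emits ADD/SUB/XOR with source == destination. */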
+FN_PREFIX(CryptonightR_instruction107): + add rsi, rbp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction108): + sub rsi, rbp +FN_PREFIX(CryptonightR_instruction109): + ror esi, cl +FN_PREFIX(CryptonightR_instruction110): + rol esi, cl +FN_PREFIX(CryptonightR_instruction111): + xor rsi, rbp +FN_PREFIX(CryptonightR_instruction112): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction113): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction114): + imul rdi, rbp +FN_PREFIX(CryptonightR_instruction115): + add rdi, rbp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction116): + sub rdi, rbp +FN_PREFIX(CryptonightR_instruction117): + ror edi, cl +FN_PREFIX(CryptonightR_instruction118): + rol edi, cl +FN_PREFIX(CryptonightR_instruction119): + xor rdi, rbp +FN_PREFIX(CryptonightR_instruction120): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction121): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction122): + imul rbp, rbp +FN_PREFIX(CryptonightR_instruction123): + add rbp, r9 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction124): + sub rbp, r9 +FN_PREFIX(CryptonightR_instruction125): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction126): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction127): + xor rbp, r9 +FN_PREFIX(CryptonightR_instruction128): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction129): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction130): + imul rbx, rsp +FN_PREFIX(CryptonightR_instruction131): + add rbx, rsp + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction132): + sub rbx, rsp +FN_PREFIX(CryptonightR_instruction133): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction134): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction135): + xor rbx, rsp +FN_PREFIX(CryptonightR_instruction136): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction137): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction138): + imul rsi, rsp +FN_PREFIX(CryptonightR_instruction139): + add rsi, rsp + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction140): + sub rsi, rsp +FN_PREFIX(CryptonightR_instruction141): + ror esi, cl +FN_PREFIX(CryptonightR_instruction142): + rol esi, cl +FN_PREFIX(CryptonightR_instruction143): + xor rsi, rsp +FN_PREFIX(CryptonightR_instruction144): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction145): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction146): + imul rdi, rsp +FN_PREFIX(CryptonightR_instruction147): + add rdi, rsp + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction148): + sub rdi, rsp +FN_PREFIX(CryptonightR_instruction149): + ror edi, cl +FN_PREFIX(CryptonightR_instruction150): + rol edi, cl +FN_PREFIX(CryptonightR_instruction151): + xor rdi, rsp +FN_PREFIX(CryptonightR_instruction152): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction153): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction154): + imul rbp, rsp +FN_PREFIX(CryptonightR_instruction155): + add rbp, rsp + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction156): + sub rbp, rsp +FN_PREFIX(CryptonightR_instruction157): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction158): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction159): + xor rbp, rsp +FN_PREFIX(CryptonightR_instruction160): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction161): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction162): + imul rbx, r15 +FN_PREFIX(CryptonightR_instruction163): + add rbx, r15 + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction164): + sub rbx, r15 +FN_PREFIX(CryptonightR_instruction165): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction166): + rol ebx, cl 
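+/* The encoded source register advances every 32 slots: rbx, rsi, rdi, rbp
+   for the four read/write math registers R0-R3, then rsp, r15, rax and rdx
+   for what should be the per-iteration read-only inputs R4-R7. Even rsp
+   appears as a plain arithmetic operand here, so these bytes can only run
+   inside the generated main loop, which has to keep the real stack pointer
+   parked elsewhere for the duration. */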
+FN_PREFIX(CryptonightR_instruction167): + xor rbx, r15 +FN_PREFIX(CryptonightR_instruction168): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction169): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction170): + imul rsi, r15 +FN_PREFIX(CryptonightR_instruction171): + add rsi, r15 + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction172): + sub rsi, r15 +FN_PREFIX(CryptonightR_instruction173): + ror esi, cl +FN_PREFIX(CryptonightR_instruction174): + rol esi, cl +FN_PREFIX(CryptonightR_instruction175): + xor rsi, r15 +FN_PREFIX(CryptonightR_instruction176): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction177): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction178): + imul rdi, r15 +FN_PREFIX(CryptonightR_instruction179): + add rdi, r15 + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction180): + sub rdi, r15 +FN_PREFIX(CryptonightR_instruction181): + ror edi, cl +FN_PREFIX(CryptonightR_instruction182): + rol edi, cl +FN_PREFIX(CryptonightR_instruction183): + xor rdi, r15 +FN_PREFIX(CryptonightR_instruction184): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction185): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction186): + imul rbp, r15 +FN_PREFIX(CryptonightR_instruction187): + add rbp, r15 + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction188): + sub rbp, r15 +FN_PREFIX(CryptonightR_instruction189): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction190): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction191): + xor rbp, r15 +FN_PREFIX(CryptonightR_instruction192): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction193): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction194): + imul rbx, rax +FN_PREFIX(CryptonightR_instruction195): + add rbx, rax + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction196): + sub rbx, rax +FN_PREFIX(CryptonightR_instruction197): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction198): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction199): + xor rbx, rax +FN_PREFIX(CryptonightR_instruction200): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction201): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction202): + imul rsi, rax +FN_PREFIX(CryptonightR_instruction203): + add rsi, rax + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction204): + sub rsi, rax +FN_PREFIX(CryptonightR_instruction205): + ror esi, cl +FN_PREFIX(CryptonightR_instruction206): + rol esi, cl +FN_PREFIX(CryptonightR_instruction207): + xor rsi, rax +FN_PREFIX(CryptonightR_instruction208): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction209): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction210): + imul rdi, rax +FN_PREFIX(CryptonightR_instruction211): + add rdi, rax + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction212): + sub rdi, rax +FN_PREFIX(CryptonightR_instruction213): + ror edi, cl +FN_PREFIX(CryptonightR_instruction214): + rol edi, cl +FN_PREFIX(CryptonightR_instruction215): + xor rdi, rax +FN_PREFIX(CryptonightR_instruction216): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction217): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction218): + imul rbp, rax +FN_PREFIX(CryptonightR_instruction219): + add rbp, rax + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction220): + sub rbp, rax +FN_PREFIX(CryptonightR_instruction221): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction222): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction223): + xor rbp, rax +FN_PREFIX(CryptonightR_instruction224): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction225): + imul rbx, rdx +FN_PREFIX(CryptonightR_instruction226): + imul rbx, rdx 
+FN_PREFIX(CryptonightR_instruction227): + add rbx, rdx + add rbx, 2147483647 +FN_PREFIX(CryptonightR_instruction228): + sub rbx, rdx +FN_PREFIX(CryptonightR_instruction229): + ror ebx, cl +FN_PREFIX(CryptonightR_instruction230): + rol ebx, cl +FN_PREFIX(CryptonightR_instruction231): + xor rbx, rdx +FN_PREFIX(CryptonightR_instruction232): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction233): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction234): + imul rsi, rdx +FN_PREFIX(CryptonightR_instruction235): + add rsi, rdx + add rsi, 2147483647 +FN_PREFIX(CryptonightR_instruction236): + sub rsi, rdx +FN_PREFIX(CryptonightR_instruction237): + ror esi, cl +FN_PREFIX(CryptonightR_instruction238): + rol esi, cl +FN_PREFIX(CryptonightR_instruction239): + xor rsi, rdx +FN_PREFIX(CryptonightR_instruction240): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction241): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction242): + imul rdi, rdx +FN_PREFIX(CryptonightR_instruction243): + add rdi, rdx + add rdi, 2147483647 +FN_PREFIX(CryptonightR_instruction244): + sub rdi, rdx +FN_PREFIX(CryptonightR_instruction245): + ror edi, cl +FN_PREFIX(CryptonightR_instruction246): + rol edi, cl +FN_PREFIX(CryptonightR_instruction247): + xor rdi, rdx +FN_PREFIX(CryptonightR_instruction248): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction249): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction250): + imul rbp, rdx +FN_PREFIX(CryptonightR_instruction251): + add rbp, rdx + add rbp, 2147483647 +FN_PREFIX(CryptonightR_instruction252): + sub rbp, rdx +FN_PREFIX(CryptonightR_instruction253): + ror ebp, cl +FN_PREFIX(CryptonightR_instruction254): + rol ebp, cl +FN_PREFIX(CryptonightR_instruction255): + xor rbp, rdx +FN_PREFIX(CryptonightR_instruction256): + imul rbx, rbx +FN_PREFIX(CryptonightR_instruction_mov0): + +FN_PREFIX(CryptonightR_instruction_mov1): + +FN_PREFIX(CryptonightR_instruction_mov2): + +FN_PREFIX(CryptonightR_instruction_mov3): + +FN_PREFIX(CryptonightR_instruction_mov4): + +FN_PREFIX(CryptonightR_instruction_mov5): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov6): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov7): + +FN_PREFIX(CryptonightR_instruction_mov8): + +FN_PREFIX(CryptonightR_instruction_mov9): + +FN_PREFIX(CryptonightR_instruction_mov10): + +FN_PREFIX(CryptonightR_instruction_mov11): + +FN_PREFIX(CryptonightR_instruction_mov12): + +FN_PREFIX(CryptonightR_instruction_mov13): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov14): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov15): + +FN_PREFIX(CryptonightR_instruction_mov16): + +FN_PREFIX(CryptonightR_instruction_mov17): + +FN_PREFIX(CryptonightR_instruction_mov18): + +FN_PREFIX(CryptonightR_instruction_mov19): + +FN_PREFIX(CryptonightR_instruction_mov20): + +FN_PREFIX(CryptonightR_instruction_mov21): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov22): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov23): + +FN_PREFIX(CryptonightR_instruction_mov24): + +FN_PREFIX(CryptonightR_instruction_mov25): + +FN_PREFIX(CryptonightR_instruction_mov26): + +FN_PREFIX(CryptonightR_instruction_mov27): + +FN_PREFIX(CryptonightR_instruction_mov28): + +FN_PREFIX(CryptonightR_instruction_mov29): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov30): + mov rcx, rbx +FN_PREFIX(CryptonightR_instruction_mov31): + +FN_PREFIX(CryptonightR_instruction_mov32): + +FN_PREFIX(CryptonightR_instruction_mov33): + +FN_PREFIX(CryptonightR_instruction_mov34): + +FN_PREFIX(CryptonightR_instruction_mov35): + 
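+/* This CryptonightR_instruction_mov<N> table runs parallel to the
+   instruction table above: only the ROR/ROL slots carry a body
+   ("mov rcx, <src>"), because the rotate count has to reach cl first; every
+   other slot is intentionally empty, so the generator copies zero bytes
+   for it. */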
+FN_PREFIX(CryptonightR_instruction_mov36): + +FN_PREFIX(CryptonightR_instruction_mov37): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov38): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov39): + +FN_PREFIX(CryptonightR_instruction_mov40): + +FN_PREFIX(CryptonightR_instruction_mov41): + +FN_PREFIX(CryptonightR_instruction_mov42): + +FN_PREFIX(CryptonightR_instruction_mov43): + +FN_PREFIX(CryptonightR_instruction_mov44): + +FN_PREFIX(CryptonightR_instruction_mov45): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov46): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov47): + +FN_PREFIX(CryptonightR_instruction_mov48): + +FN_PREFIX(CryptonightR_instruction_mov49): + +FN_PREFIX(CryptonightR_instruction_mov50): + +FN_PREFIX(CryptonightR_instruction_mov51): + +FN_PREFIX(CryptonightR_instruction_mov52): + +FN_PREFIX(CryptonightR_instruction_mov53): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov54): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov55): + +FN_PREFIX(CryptonightR_instruction_mov56): + +FN_PREFIX(CryptonightR_instruction_mov57): + +FN_PREFIX(CryptonightR_instruction_mov58): + +FN_PREFIX(CryptonightR_instruction_mov59): + +FN_PREFIX(CryptonightR_instruction_mov60): + +FN_PREFIX(CryptonightR_instruction_mov61): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov62): + mov rcx, rsi +FN_PREFIX(CryptonightR_instruction_mov63): + +FN_PREFIX(CryptonightR_instruction_mov64): + +FN_PREFIX(CryptonightR_instruction_mov65): + +FN_PREFIX(CryptonightR_instruction_mov66): + +FN_PREFIX(CryptonightR_instruction_mov67): + +FN_PREFIX(CryptonightR_instruction_mov68): + +FN_PREFIX(CryptonightR_instruction_mov69): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov70): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov71): + +FN_PREFIX(CryptonightR_instruction_mov72): + +FN_PREFIX(CryptonightR_instruction_mov73): + +FN_PREFIX(CryptonightR_instruction_mov74): + +FN_PREFIX(CryptonightR_instruction_mov75): + +FN_PREFIX(CryptonightR_instruction_mov76): + +FN_PREFIX(CryptonightR_instruction_mov77): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov78): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov79): + +FN_PREFIX(CryptonightR_instruction_mov80): + +FN_PREFIX(CryptonightR_instruction_mov81): + +FN_PREFIX(CryptonightR_instruction_mov82): + +FN_PREFIX(CryptonightR_instruction_mov83): + +FN_PREFIX(CryptonightR_instruction_mov84): + +FN_PREFIX(CryptonightR_instruction_mov85): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov86): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov87): + +FN_PREFIX(CryptonightR_instruction_mov88): + +FN_PREFIX(CryptonightR_instruction_mov89): + +FN_PREFIX(CryptonightR_instruction_mov90): + +FN_PREFIX(CryptonightR_instruction_mov91): + +FN_PREFIX(CryptonightR_instruction_mov92): + +FN_PREFIX(CryptonightR_instruction_mov93): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov94): + mov rcx, rdi +FN_PREFIX(CryptonightR_instruction_mov95): + +FN_PREFIX(CryptonightR_instruction_mov96): + +FN_PREFIX(CryptonightR_instruction_mov97): + +FN_PREFIX(CryptonightR_instruction_mov98): + +FN_PREFIX(CryptonightR_instruction_mov99): + +FN_PREFIX(CryptonightR_instruction_mov100): + +FN_PREFIX(CryptonightR_instruction_mov101): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov102): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov103): + +FN_PREFIX(CryptonightR_instruction_mov104): + +FN_PREFIX(CryptonightR_instruction_mov105): + +FN_PREFIX(CryptonightR_instruction_mov106): + +FN_PREFIX(CryptonightR_instruction_mov107): + 
+FN_PREFIX(CryptonightR_instruction_mov108): + +FN_PREFIX(CryptonightR_instruction_mov109): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov110): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov111): + +FN_PREFIX(CryptonightR_instruction_mov112): + +FN_PREFIX(CryptonightR_instruction_mov113): + +FN_PREFIX(CryptonightR_instruction_mov114): + +FN_PREFIX(CryptonightR_instruction_mov115): + +FN_PREFIX(CryptonightR_instruction_mov116): + +FN_PREFIX(CryptonightR_instruction_mov117): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov118): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov119): + +FN_PREFIX(CryptonightR_instruction_mov120): + +FN_PREFIX(CryptonightR_instruction_mov121): + +FN_PREFIX(CryptonightR_instruction_mov122): + +FN_PREFIX(CryptonightR_instruction_mov123): + +FN_PREFIX(CryptonightR_instruction_mov124): + +FN_PREFIX(CryptonightR_instruction_mov125): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov126): + mov rcx, rbp +FN_PREFIX(CryptonightR_instruction_mov127): + +FN_PREFIX(CryptonightR_instruction_mov128): + +FN_PREFIX(CryptonightR_instruction_mov129): + +FN_PREFIX(CryptonightR_instruction_mov130): + +FN_PREFIX(CryptonightR_instruction_mov131): + +FN_PREFIX(CryptonightR_instruction_mov132): + +FN_PREFIX(CryptonightR_instruction_mov133): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov134): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov135): + +FN_PREFIX(CryptonightR_instruction_mov136): + +FN_PREFIX(CryptonightR_instruction_mov137): + +FN_PREFIX(CryptonightR_instruction_mov138): + +FN_PREFIX(CryptonightR_instruction_mov139): + +FN_PREFIX(CryptonightR_instruction_mov140): + +FN_PREFIX(CryptonightR_instruction_mov141): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov142): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov143): + +FN_PREFIX(CryptonightR_instruction_mov144): + +FN_PREFIX(CryptonightR_instruction_mov145): + +FN_PREFIX(CryptonightR_instruction_mov146): + +FN_PREFIX(CryptonightR_instruction_mov147): + +FN_PREFIX(CryptonightR_instruction_mov148): + +FN_PREFIX(CryptonightR_instruction_mov149): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov150): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov151): + +FN_PREFIX(CryptonightR_instruction_mov152): + +FN_PREFIX(CryptonightR_instruction_mov153): + +FN_PREFIX(CryptonightR_instruction_mov154): + +FN_PREFIX(CryptonightR_instruction_mov155): + +FN_PREFIX(CryptonightR_instruction_mov156): + +FN_PREFIX(CryptonightR_instruction_mov157): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov158): + mov rcx, rsp +FN_PREFIX(CryptonightR_instruction_mov159): + +FN_PREFIX(CryptonightR_instruction_mov160): + +FN_PREFIX(CryptonightR_instruction_mov161): + +FN_PREFIX(CryptonightR_instruction_mov162): + +FN_PREFIX(CryptonightR_instruction_mov163): + +FN_PREFIX(CryptonightR_instruction_mov164): + +FN_PREFIX(CryptonightR_instruction_mov165): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov166): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov167): + +FN_PREFIX(CryptonightR_instruction_mov168): + +FN_PREFIX(CryptonightR_instruction_mov169): + +FN_PREFIX(CryptonightR_instruction_mov170): + +FN_PREFIX(CryptonightR_instruction_mov171): + +FN_PREFIX(CryptonightR_instruction_mov172): + +FN_PREFIX(CryptonightR_instruction_mov173): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov174): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov175): + +FN_PREFIX(CryptonightR_instruction_mov176): + +FN_PREFIX(CryptonightR_instruction_mov177): + 
+FN_PREFIX(CryptonightR_instruction_mov178): + +FN_PREFIX(CryptonightR_instruction_mov179): + +FN_PREFIX(CryptonightR_instruction_mov180): + +FN_PREFIX(CryptonightR_instruction_mov181): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov182): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov183): + +FN_PREFIX(CryptonightR_instruction_mov184): + +FN_PREFIX(CryptonightR_instruction_mov185): + +FN_PREFIX(CryptonightR_instruction_mov186): + +FN_PREFIX(CryptonightR_instruction_mov187): + +FN_PREFIX(CryptonightR_instruction_mov188): + +FN_PREFIX(CryptonightR_instruction_mov189): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov190): + mov rcx, r15 +FN_PREFIX(CryptonightR_instruction_mov191): + +FN_PREFIX(CryptonightR_instruction_mov192): + +FN_PREFIX(CryptonightR_instruction_mov193): + +FN_PREFIX(CryptonightR_instruction_mov194): + +FN_PREFIX(CryptonightR_instruction_mov195): + +FN_PREFIX(CryptonightR_instruction_mov196): + +FN_PREFIX(CryptonightR_instruction_mov197): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov198): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov199): + +FN_PREFIX(CryptonightR_instruction_mov200): + +FN_PREFIX(CryptonightR_instruction_mov201): + +FN_PREFIX(CryptonightR_instruction_mov202): + +FN_PREFIX(CryptonightR_instruction_mov203): + +FN_PREFIX(CryptonightR_instruction_mov204): + +FN_PREFIX(CryptonightR_instruction_mov205): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov206): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov207): + +FN_PREFIX(CryptonightR_instruction_mov208): + +FN_PREFIX(CryptonightR_instruction_mov209): + +FN_PREFIX(CryptonightR_instruction_mov210): + +FN_PREFIX(CryptonightR_instruction_mov211): + +FN_PREFIX(CryptonightR_instruction_mov212): + +FN_PREFIX(CryptonightR_instruction_mov213): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov214): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov215): + +FN_PREFIX(CryptonightR_instruction_mov216): + +FN_PREFIX(CryptonightR_instruction_mov217): + +FN_PREFIX(CryptonightR_instruction_mov218): + +FN_PREFIX(CryptonightR_instruction_mov219): + +FN_PREFIX(CryptonightR_instruction_mov220): + +FN_PREFIX(CryptonightR_instruction_mov221): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov222): + mov rcx, rax +FN_PREFIX(CryptonightR_instruction_mov223): + +FN_PREFIX(CryptonightR_instruction_mov224): + +FN_PREFIX(CryptonightR_instruction_mov225): + +FN_PREFIX(CryptonightR_instruction_mov226): + +FN_PREFIX(CryptonightR_instruction_mov227): + +FN_PREFIX(CryptonightR_instruction_mov228): + +FN_PREFIX(CryptonightR_instruction_mov229): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov230): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov231): + +FN_PREFIX(CryptonightR_instruction_mov232): + +FN_PREFIX(CryptonightR_instruction_mov233): + +FN_PREFIX(CryptonightR_instruction_mov234): + +FN_PREFIX(CryptonightR_instruction_mov235): + +FN_PREFIX(CryptonightR_instruction_mov236): + +FN_PREFIX(CryptonightR_instruction_mov237): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov238): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov239): + +FN_PREFIX(CryptonightR_instruction_mov240): + +FN_PREFIX(CryptonightR_instruction_mov241): + +FN_PREFIX(CryptonightR_instruction_mov242): + +FN_PREFIX(CryptonightR_instruction_mov243): + +FN_PREFIX(CryptonightR_instruction_mov244): + +FN_PREFIX(CryptonightR_instruction_mov245): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov246): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov247): + 
+FN_PREFIX(CryptonightR_instruction_mov248): + +FN_PREFIX(CryptonightR_instruction_mov249): + +FN_PREFIX(CryptonightR_instruction_mov250): + +FN_PREFIX(CryptonightR_instruction_mov251): + +FN_PREFIX(CryptonightR_instruction_mov252): + +FN_PREFIX(CryptonightR_instruction_mov253): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov254): + mov rcx, rdx +FN_PREFIX(CryptonightR_instruction_mov255): + +FN_PREFIX(CryptonightR_instruction_mov256): diff --git a/src/crypto/asm/win/CryptonightR_template.asm b/src/crypto/asm/win/CryptonightR_template.asm new file mode 100644 index 00000000..250eca3d --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_template.asm @@ -0,0 +1,1585 @@ +; Auto-generated file, do not edit + +_TEXT_CN_TEMPLATE SEGMENT PAGE READ EXECUTE +PUBLIC CryptonightR_instruction0 +PUBLIC CryptonightR_instruction1 +PUBLIC CryptonightR_instruction2 +PUBLIC CryptonightR_instruction3 +PUBLIC CryptonightR_instruction4 +PUBLIC CryptonightR_instruction5 +PUBLIC CryptonightR_instruction6 +PUBLIC CryptonightR_instruction7 +PUBLIC CryptonightR_instruction8 +PUBLIC CryptonightR_instruction9 +PUBLIC CryptonightR_instruction10 +PUBLIC CryptonightR_instruction11 +PUBLIC CryptonightR_instruction12 +PUBLIC CryptonightR_instruction13 +PUBLIC CryptonightR_instruction14 +PUBLIC CryptonightR_instruction15 +PUBLIC CryptonightR_instruction16 +PUBLIC CryptonightR_instruction17 +PUBLIC CryptonightR_instruction18 +PUBLIC CryptonightR_instruction19 +PUBLIC CryptonightR_instruction20 +PUBLIC CryptonightR_instruction21 +PUBLIC CryptonightR_instruction22 +PUBLIC CryptonightR_instruction23 +PUBLIC CryptonightR_instruction24 +PUBLIC CryptonightR_instruction25 +PUBLIC CryptonightR_instruction26 +PUBLIC CryptonightR_instruction27 +PUBLIC CryptonightR_instruction28 +PUBLIC CryptonightR_instruction29 +PUBLIC CryptonightR_instruction30 +PUBLIC CryptonightR_instruction31 +PUBLIC CryptonightR_instruction32 +PUBLIC CryptonightR_instruction33 +PUBLIC CryptonightR_instruction34 +PUBLIC CryptonightR_instruction35 +PUBLIC CryptonightR_instruction36 +PUBLIC CryptonightR_instruction37 +PUBLIC CryptonightR_instruction38 +PUBLIC CryptonightR_instruction39 +PUBLIC CryptonightR_instruction40 +PUBLIC CryptonightR_instruction41 +PUBLIC CryptonightR_instruction42 +PUBLIC CryptonightR_instruction43 +PUBLIC CryptonightR_instruction44 +PUBLIC CryptonightR_instruction45 +PUBLIC CryptonightR_instruction46 +PUBLIC CryptonightR_instruction47 +PUBLIC CryptonightR_instruction48 +PUBLIC CryptonightR_instruction49 +PUBLIC CryptonightR_instruction50 +PUBLIC CryptonightR_instruction51 +PUBLIC CryptonightR_instruction52 +PUBLIC CryptonightR_instruction53 +PUBLIC CryptonightR_instruction54 +PUBLIC CryptonightR_instruction55 +PUBLIC CryptonightR_instruction56 +PUBLIC CryptonightR_instruction57 +PUBLIC CryptonightR_instruction58 +PUBLIC CryptonightR_instruction59 +PUBLIC CryptonightR_instruction60 +PUBLIC CryptonightR_instruction61 +PUBLIC CryptonightR_instruction62 +PUBLIC CryptonightR_instruction63 +PUBLIC CryptonightR_instruction64 +PUBLIC CryptonightR_instruction65 +PUBLIC CryptonightR_instruction66 +PUBLIC CryptonightR_instruction67 +PUBLIC CryptonightR_instruction68 +PUBLIC CryptonightR_instruction69 +PUBLIC CryptonightR_instruction70 +PUBLIC CryptonightR_instruction71 +PUBLIC CryptonightR_instruction72 +PUBLIC CryptonightR_instruction73 +PUBLIC CryptonightR_instruction74 +PUBLIC CryptonightR_instruction75 +PUBLIC CryptonightR_instruction76 +PUBLIC CryptonightR_instruction77 +PUBLIC CryptonightR_instruction78 +PUBLIC CryptonightR_instruction79 
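+; CryptonightR_template.asm appears to be the ml64 twin of
+; CryptonightR_template.S: the same generated slot tables, with what look
+; like the MASM-syntax *_win.inc variants of the templates INCLUDEd further
+; down instead of the GAS *.inc ones.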
+PUBLIC CryptonightR_instruction80 +PUBLIC CryptonightR_instruction81 +PUBLIC CryptonightR_instruction82 +PUBLIC CryptonightR_instruction83 +PUBLIC CryptonightR_instruction84 +PUBLIC CryptonightR_instruction85 +PUBLIC CryptonightR_instruction86 +PUBLIC CryptonightR_instruction87 +PUBLIC CryptonightR_instruction88 +PUBLIC CryptonightR_instruction89 +PUBLIC CryptonightR_instruction90 +PUBLIC CryptonightR_instruction91 +PUBLIC CryptonightR_instruction92 +PUBLIC CryptonightR_instruction93 +PUBLIC CryptonightR_instruction94 +PUBLIC CryptonightR_instruction95 +PUBLIC CryptonightR_instruction96 +PUBLIC CryptonightR_instruction97 +PUBLIC CryptonightR_instruction98 +PUBLIC CryptonightR_instruction99 +PUBLIC CryptonightR_instruction100 +PUBLIC CryptonightR_instruction101 +PUBLIC CryptonightR_instruction102 +PUBLIC CryptonightR_instruction103 +PUBLIC CryptonightR_instruction104 +PUBLIC CryptonightR_instruction105 +PUBLIC CryptonightR_instruction106 +PUBLIC CryptonightR_instruction107 +PUBLIC CryptonightR_instruction108 +PUBLIC CryptonightR_instruction109 +PUBLIC CryptonightR_instruction110 +PUBLIC CryptonightR_instruction111 +PUBLIC CryptonightR_instruction112 +PUBLIC CryptonightR_instruction113 +PUBLIC CryptonightR_instruction114 +PUBLIC CryptonightR_instruction115 +PUBLIC CryptonightR_instruction116 +PUBLIC CryptonightR_instruction117 +PUBLIC CryptonightR_instruction118 +PUBLIC CryptonightR_instruction119 +PUBLIC CryptonightR_instruction120 +PUBLIC CryptonightR_instruction121 +PUBLIC CryptonightR_instruction122 +PUBLIC CryptonightR_instruction123 +PUBLIC CryptonightR_instruction124 +PUBLIC CryptonightR_instruction125 +PUBLIC CryptonightR_instruction126 +PUBLIC CryptonightR_instruction127 +PUBLIC CryptonightR_instruction128 +PUBLIC CryptonightR_instruction129 +PUBLIC CryptonightR_instruction130 +PUBLIC CryptonightR_instruction131 +PUBLIC CryptonightR_instruction132 +PUBLIC CryptonightR_instruction133 +PUBLIC CryptonightR_instruction134 +PUBLIC CryptonightR_instruction135 +PUBLIC CryptonightR_instruction136 +PUBLIC CryptonightR_instruction137 +PUBLIC CryptonightR_instruction138 +PUBLIC CryptonightR_instruction139 +PUBLIC CryptonightR_instruction140 +PUBLIC CryptonightR_instruction141 +PUBLIC CryptonightR_instruction142 +PUBLIC CryptonightR_instruction143 +PUBLIC CryptonightR_instruction144 +PUBLIC CryptonightR_instruction145 +PUBLIC CryptonightR_instruction146 +PUBLIC CryptonightR_instruction147 +PUBLIC CryptonightR_instruction148 +PUBLIC CryptonightR_instruction149 +PUBLIC CryptonightR_instruction150 +PUBLIC CryptonightR_instruction151 +PUBLIC CryptonightR_instruction152 +PUBLIC CryptonightR_instruction153 +PUBLIC CryptonightR_instruction154 +PUBLIC CryptonightR_instruction155 +PUBLIC CryptonightR_instruction156 +PUBLIC CryptonightR_instruction157 +PUBLIC CryptonightR_instruction158 +PUBLIC CryptonightR_instruction159 +PUBLIC CryptonightR_instruction160 +PUBLIC CryptonightR_instruction161 +PUBLIC CryptonightR_instruction162 +PUBLIC CryptonightR_instruction163 +PUBLIC CryptonightR_instruction164 +PUBLIC CryptonightR_instruction165 +PUBLIC CryptonightR_instruction166 +PUBLIC CryptonightR_instruction167 +PUBLIC CryptonightR_instruction168 +PUBLIC CryptonightR_instruction169 +PUBLIC CryptonightR_instruction170 +PUBLIC CryptonightR_instruction171 +PUBLIC CryptonightR_instruction172 +PUBLIC CryptonightR_instruction173 +PUBLIC CryptonightR_instruction174 +PUBLIC CryptonightR_instruction175 +PUBLIC CryptonightR_instruction176 +PUBLIC CryptonightR_instruction177 +PUBLIC CryptonightR_instruction178 +PUBLIC 
CryptonightR_instruction179 +PUBLIC CryptonightR_instruction180 +PUBLIC CryptonightR_instruction181 +PUBLIC CryptonightR_instruction182 +PUBLIC CryptonightR_instruction183 +PUBLIC CryptonightR_instruction184 +PUBLIC CryptonightR_instruction185 +PUBLIC CryptonightR_instruction186 +PUBLIC CryptonightR_instruction187 +PUBLIC CryptonightR_instruction188 +PUBLIC CryptonightR_instruction189 +PUBLIC CryptonightR_instruction190 +PUBLIC CryptonightR_instruction191 +PUBLIC CryptonightR_instruction192 +PUBLIC CryptonightR_instruction193 +PUBLIC CryptonightR_instruction194 +PUBLIC CryptonightR_instruction195 +PUBLIC CryptonightR_instruction196 +PUBLIC CryptonightR_instruction197 +PUBLIC CryptonightR_instruction198 +PUBLIC CryptonightR_instruction199 +PUBLIC CryptonightR_instruction200 +PUBLIC CryptonightR_instruction201 +PUBLIC CryptonightR_instruction202 +PUBLIC CryptonightR_instruction203 +PUBLIC CryptonightR_instruction204 +PUBLIC CryptonightR_instruction205 +PUBLIC CryptonightR_instruction206 +PUBLIC CryptonightR_instruction207 +PUBLIC CryptonightR_instruction208 +PUBLIC CryptonightR_instruction209 +PUBLIC CryptonightR_instruction210 +PUBLIC CryptonightR_instruction211 +PUBLIC CryptonightR_instruction212 +PUBLIC CryptonightR_instruction213 +PUBLIC CryptonightR_instruction214 +PUBLIC CryptonightR_instruction215 +PUBLIC CryptonightR_instruction216 +PUBLIC CryptonightR_instruction217 +PUBLIC CryptonightR_instruction218 +PUBLIC CryptonightR_instruction219 +PUBLIC CryptonightR_instruction220 +PUBLIC CryptonightR_instruction221 +PUBLIC CryptonightR_instruction222 +PUBLIC CryptonightR_instruction223 +PUBLIC CryptonightR_instruction224 +PUBLIC CryptonightR_instruction225 +PUBLIC CryptonightR_instruction226 +PUBLIC CryptonightR_instruction227 +PUBLIC CryptonightR_instruction228 +PUBLIC CryptonightR_instruction229 +PUBLIC CryptonightR_instruction230 +PUBLIC CryptonightR_instruction231 +PUBLIC CryptonightR_instruction232 +PUBLIC CryptonightR_instruction233 +PUBLIC CryptonightR_instruction234 +PUBLIC CryptonightR_instruction235 +PUBLIC CryptonightR_instruction236 +PUBLIC CryptonightR_instruction237 +PUBLIC CryptonightR_instruction238 +PUBLIC CryptonightR_instruction239 +PUBLIC CryptonightR_instruction240 +PUBLIC CryptonightR_instruction241 +PUBLIC CryptonightR_instruction242 +PUBLIC CryptonightR_instruction243 +PUBLIC CryptonightR_instruction244 +PUBLIC CryptonightR_instruction245 +PUBLIC CryptonightR_instruction246 +PUBLIC CryptonightR_instruction247 +PUBLIC CryptonightR_instruction248 +PUBLIC CryptonightR_instruction249 +PUBLIC CryptonightR_instruction250 +PUBLIC CryptonightR_instruction251 +PUBLIC CryptonightR_instruction252 +PUBLIC CryptonightR_instruction253 +PUBLIC CryptonightR_instruction254 +PUBLIC CryptonightR_instruction255 +PUBLIC CryptonightR_instruction256 +PUBLIC CryptonightR_instruction_mov0 +PUBLIC CryptonightR_instruction_mov1 +PUBLIC CryptonightR_instruction_mov2 +PUBLIC CryptonightR_instruction_mov3 +PUBLIC CryptonightR_instruction_mov4 +PUBLIC CryptonightR_instruction_mov5 +PUBLIC CryptonightR_instruction_mov6 +PUBLIC CryptonightR_instruction_mov7 +PUBLIC CryptonightR_instruction_mov8 +PUBLIC CryptonightR_instruction_mov9 +PUBLIC CryptonightR_instruction_mov10 +PUBLIC CryptonightR_instruction_mov11 +PUBLIC CryptonightR_instruction_mov12 +PUBLIC CryptonightR_instruction_mov13 +PUBLIC CryptonightR_instruction_mov14 +PUBLIC CryptonightR_instruction_mov15 +PUBLIC CryptonightR_instruction_mov16 +PUBLIC CryptonightR_instruction_mov17 +PUBLIC CryptonightR_instruction_mov18 +PUBLIC 
CryptonightR_instruction_mov19 +PUBLIC CryptonightR_instruction_mov20 +PUBLIC CryptonightR_instruction_mov21 +PUBLIC CryptonightR_instruction_mov22 +PUBLIC CryptonightR_instruction_mov23 +PUBLIC CryptonightR_instruction_mov24 +PUBLIC CryptonightR_instruction_mov25 +PUBLIC CryptonightR_instruction_mov26 +PUBLIC CryptonightR_instruction_mov27 +PUBLIC CryptonightR_instruction_mov28 +PUBLIC CryptonightR_instruction_mov29 +PUBLIC CryptonightR_instruction_mov30 +PUBLIC CryptonightR_instruction_mov31 +PUBLIC CryptonightR_instruction_mov32 +PUBLIC CryptonightR_instruction_mov33 +PUBLIC CryptonightR_instruction_mov34 +PUBLIC CryptonightR_instruction_mov35 +PUBLIC CryptonightR_instruction_mov36 +PUBLIC CryptonightR_instruction_mov37 +PUBLIC CryptonightR_instruction_mov38 +PUBLIC CryptonightR_instruction_mov39 +PUBLIC CryptonightR_instruction_mov40 +PUBLIC CryptonightR_instruction_mov41 +PUBLIC CryptonightR_instruction_mov42 +PUBLIC CryptonightR_instruction_mov43 +PUBLIC CryptonightR_instruction_mov44 +PUBLIC CryptonightR_instruction_mov45 +PUBLIC CryptonightR_instruction_mov46 +PUBLIC CryptonightR_instruction_mov47 +PUBLIC CryptonightR_instruction_mov48 +PUBLIC CryptonightR_instruction_mov49 +PUBLIC CryptonightR_instruction_mov50 +PUBLIC CryptonightR_instruction_mov51 +PUBLIC CryptonightR_instruction_mov52 +PUBLIC CryptonightR_instruction_mov53 +PUBLIC CryptonightR_instruction_mov54 +PUBLIC CryptonightR_instruction_mov55 +PUBLIC CryptonightR_instruction_mov56 +PUBLIC CryptonightR_instruction_mov57 +PUBLIC CryptonightR_instruction_mov58 +PUBLIC CryptonightR_instruction_mov59 +PUBLIC CryptonightR_instruction_mov60 +PUBLIC CryptonightR_instruction_mov61 +PUBLIC CryptonightR_instruction_mov62 +PUBLIC CryptonightR_instruction_mov63 +PUBLIC CryptonightR_instruction_mov64 +PUBLIC CryptonightR_instruction_mov65 +PUBLIC CryptonightR_instruction_mov66 +PUBLIC CryptonightR_instruction_mov67 +PUBLIC CryptonightR_instruction_mov68 +PUBLIC CryptonightR_instruction_mov69 +PUBLIC CryptonightR_instruction_mov70 +PUBLIC CryptonightR_instruction_mov71 +PUBLIC CryptonightR_instruction_mov72 +PUBLIC CryptonightR_instruction_mov73 +PUBLIC CryptonightR_instruction_mov74 +PUBLIC CryptonightR_instruction_mov75 +PUBLIC CryptonightR_instruction_mov76 +PUBLIC CryptonightR_instruction_mov77 +PUBLIC CryptonightR_instruction_mov78 +PUBLIC CryptonightR_instruction_mov79 +PUBLIC CryptonightR_instruction_mov80 +PUBLIC CryptonightR_instruction_mov81 +PUBLIC CryptonightR_instruction_mov82 +PUBLIC CryptonightR_instruction_mov83 +PUBLIC CryptonightR_instruction_mov84 +PUBLIC CryptonightR_instruction_mov85 +PUBLIC CryptonightR_instruction_mov86 +PUBLIC CryptonightR_instruction_mov87 +PUBLIC CryptonightR_instruction_mov88 +PUBLIC CryptonightR_instruction_mov89 +PUBLIC CryptonightR_instruction_mov90 +PUBLIC CryptonightR_instruction_mov91 +PUBLIC CryptonightR_instruction_mov92 +PUBLIC CryptonightR_instruction_mov93 +PUBLIC CryptonightR_instruction_mov94 +PUBLIC CryptonightR_instruction_mov95 +PUBLIC CryptonightR_instruction_mov96 +PUBLIC CryptonightR_instruction_mov97 +PUBLIC CryptonightR_instruction_mov98 +PUBLIC CryptonightR_instruction_mov99 +PUBLIC CryptonightR_instruction_mov100 +PUBLIC CryptonightR_instruction_mov101 +PUBLIC CryptonightR_instruction_mov102 +PUBLIC CryptonightR_instruction_mov103 +PUBLIC CryptonightR_instruction_mov104 +PUBLIC CryptonightR_instruction_mov105 +PUBLIC CryptonightR_instruction_mov106 +PUBLIC CryptonightR_instruction_mov107 +PUBLIC CryptonightR_instruction_mov108 +PUBLIC CryptonightR_instruction_mov109 
+PUBLIC CryptonightR_instruction_mov110 +PUBLIC CryptonightR_instruction_mov111 +PUBLIC CryptonightR_instruction_mov112 +PUBLIC CryptonightR_instruction_mov113 +PUBLIC CryptonightR_instruction_mov114 +PUBLIC CryptonightR_instruction_mov115 +PUBLIC CryptonightR_instruction_mov116 +PUBLIC CryptonightR_instruction_mov117 +PUBLIC CryptonightR_instruction_mov118 +PUBLIC CryptonightR_instruction_mov119 +PUBLIC CryptonightR_instruction_mov120 +PUBLIC CryptonightR_instruction_mov121 +PUBLIC CryptonightR_instruction_mov122 +PUBLIC CryptonightR_instruction_mov123 +PUBLIC CryptonightR_instruction_mov124 +PUBLIC CryptonightR_instruction_mov125 +PUBLIC CryptonightR_instruction_mov126 +PUBLIC CryptonightR_instruction_mov127 +PUBLIC CryptonightR_instruction_mov128 +PUBLIC CryptonightR_instruction_mov129 +PUBLIC CryptonightR_instruction_mov130 +PUBLIC CryptonightR_instruction_mov131 +PUBLIC CryptonightR_instruction_mov132 +PUBLIC CryptonightR_instruction_mov133 +PUBLIC CryptonightR_instruction_mov134 +PUBLIC CryptonightR_instruction_mov135 +PUBLIC CryptonightR_instruction_mov136 +PUBLIC CryptonightR_instruction_mov137 +PUBLIC CryptonightR_instruction_mov138 +PUBLIC CryptonightR_instruction_mov139 +PUBLIC CryptonightR_instruction_mov140 +PUBLIC CryptonightR_instruction_mov141 +PUBLIC CryptonightR_instruction_mov142 +PUBLIC CryptonightR_instruction_mov143 +PUBLIC CryptonightR_instruction_mov144 +PUBLIC CryptonightR_instruction_mov145 +PUBLIC CryptonightR_instruction_mov146 +PUBLIC CryptonightR_instruction_mov147 +PUBLIC CryptonightR_instruction_mov148 +PUBLIC CryptonightR_instruction_mov149 +PUBLIC CryptonightR_instruction_mov150 +PUBLIC CryptonightR_instruction_mov151 +PUBLIC CryptonightR_instruction_mov152 +PUBLIC CryptonightR_instruction_mov153 +PUBLIC CryptonightR_instruction_mov154 +PUBLIC CryptonightR_instruction_mov155 +PUBLIC CryptonightR_instruction_mov156 +PUBLIC CryptonightR_instruction_mov157 +PUBLIC CryptonightR_instruction_mov158 +PUBLIC CryptonightR_instruction_mov159 +PUBLIC CryptonightR_instruction_mov160 +PUBLIC CryptonightR_instruction_mov161 +PUBLIC CryptonightR_instruction_mov162 +PUBLIC CryptonightR_instruction_mov163 +PUBLIC CryptonightR_instruction_mov164 +PUBLIC CryptonightR_instruction_mov165 +PUBLIC CryptonightR_instruction_mov166 +PUBLIC CryptonightR_instruction_mov167 +PUBLIC CryptonightR_instruction_mov168 +PUBLIC CryptonightR_instruction_mov169 +PUBLIC CryptonightR_instruction_mov170 +PUBLIC CryptonightR_instruction_mov171 +PUBLIC CryptonightR_instruction_mov172 +PUBLIC CryptonightR_instruction_mov173 +PUBLIC CryptonightR_instruction_mov174 +PUBLIC CryptonightR_instruction_mov175 +PUBLIC CryptonightR_instruction_mov176 +PUBLIC CryptonightR_instruction_mov177 +PUBLIC CryptonightR_instruction_mov178 +PUBLIC CryptonightR_instruction_mov179 +PUBLIC CryptonightR_instruction_mov180 +PUBLIC CryptonightR_instruction_mov181 +PUBLIC CryptonightR_instruction_mov182 +PUBLIC CryptonightR_instruction_mov183 +PUBLIC CryptonightR_instruction_mov184 +PUBLIC CryptonightR_instruction_mov185 +PUBLIC CryptonightR_instruction_mov186 +PUBLIC CryptonightR_instruction_mov187 +PUBLIC CryptonightR_instruction_mov188 +PUBLIC CryptonightR_instruction_mov189 +PUBLIC CryptonightR_instruction_mov190 +PUBLIC CryptonightR_instruction_mov191 +PUBLIC CryptonightR_instruction_mov192 +PUBLIC CryptonightR_instruction_mov193 +PUBLIC CryptonightR_instruction_mov194 +PUBLIC CryptonightR_instruction_mov195 +PUBLIC CryptonightR_instruction_mov196 +PUBLIC CryptonightR_instruction_mov197 +PUBLIC 
CryptonightR_instruction_mov198 +PUBLIC CryptonightR_instruction_mov199 +PUBLIC CryptonightR_instruction_mov200 +PUBLIC CryptonightR_instruction_mov201 +PUBLIC CryptonightR_instruction_mov202 +PUBLIC CryptonightR_instruction_mov203 +PUBLIC CryptonightR_instruction_mov204 +PUBLIC CryptonightR_instruction_mov205 +PUBLIC CryptonightR_instruction_mov206 +PUBLIC CryptonightR_instruction_mov207 +PUBLIC CryptonightR_instruction_mov208 +PUBLIC CryptonightR_instruction_mov209 +PUBLIC CryptonightR_instruction_mov210 +PUBLIC CryptonightR_instruction_mov211 +PUBLIC CryptonightR_instruction_mov212 +PUBLIC CryptonightR_instruction_mov213 +PUBLIC CryptonightR_instruction_mov214 +PUBLIC CryptonightR_instruction_mov215 +PUBLIC CryptonightR_instruction_mov216 +PUBLIC CryptonightR_instruction_mov217 +PUBLIC CryptonightR_instruction_mov218 +PUBLIC CryptonightR_instruction_mov219 +PUBLIC CryptonightR_instruction_mov220 +PUBLIC CryptonightR_instruction_mov221 +PUBLIC CryptonightR_instruction_mov222 +PUBLIC CryptonightR_instruction_mov223 +PUBLIC CryptonightR_instruction_mov224 +PUBLIC CryptonightR_instruction_mov225 +PUBLIC CryptonightR_instruction_mov226 +PUBLIC CryptonightR_instruction_mov227 +PUBLIC CryptonightR_instruction_mov228 +PUBLIC CryptonightR_instruction_mov229 +PUBLIC CryptonightR_instruction_mov230 +PUBLIC CryptonightR_instruction_mov231 +PUBLIC CryptonightR_instruction_mov232 +PUBLIC CryptonightR_instruction_mov233 +PUBLIC CryptonightR_instruction_mov234 +PUBLIC CryptonightR_instruction_mov235 +PUBLIC CryptonightR_instruction_mov236 +PUBLIC CryptonightR_instruction_mov237 +PUBLIC CryptonightR_instruction_mov238 +PUBLIC CryptonightR_instruction_mov239 +PUBLIC CryptonightR_instruction_mov240 +PUBLIC CryptonightR_instruction_mov241 +PUBLIC CryptonightR_instruction_mov242 +PUBLIC CryptonightR_instruction_mov243 +PUBLIC CryptonightR_instruction_mov244 +PUBLIC CryptonightR_instruction_mov245 +PUBLIC CryptonightR_instruction_mov246 +PUBLIC CryptonightR_instruction_mov247 +PUBLIC CryptonightR_instruction_mov248 +PUBLIC CryptonightR_instruction_mov249 +PUBLIC CryptonightR_instruction_mov250 +PUBLIC CryptonightR_instruction_mov251 +PUBLIC CryptonightR_instruction_mov252 +PUBLIC CryptonightR_instruction_mov253 +PUBLIC CryptonightR_instruction_mov254 +PUBLIC CryptonightR_instruction_mov255 +PUBLIC CryptonightR_instruction_mov256 + +INCLUDE CryptonightWOW_template_win.inc +INCLUDE CryptonightR_template_win.inc +INCLUDE CryptonightWOW_soft_aes_template_win.inc +INCLUDE CryptonightR_soft_aes_template_win.inc + +CryptonightR_instruction0: + imul rbx, rbx +CryptonightR_instruction1: + imul rbx, rbx +CryptonightR_instruction2: + imul rbx, rbx +CryptonightR_instruction3: + add rbx, r9 + add rbx, 2147483647 +CryptonightR_instruction4: + sub rbx, r9 +CryptonightR_instruction5: + ror ebx, cl +CryptonightR_instruction6: + rol ebx, cl +CryptonightR_instruction7: + xor rbx, r9 +CryptonightR_instruction8: + imul rsi, rbx +CryptonightR_instruction9: + imul rsi, rbx +CryptonightR_instruction10: + imul rsi, rbx +CryptonightR_instruction11: + add rsi, rbx + add rsi, 2147483647 +CryptonightR_instruction12: + sub rsi, rbx +CryptonightR_instruction13: + ror esi, cl +CryptonightR_instruction14: + rol esi, cl +CryptonightR_instruction15: + xor rsi, rbx +CryptonightR_instruction16: + imul rdi, rbx +CryptonightR_instruction17: + imul rdi, rbx +CryptonightR_instruction18: + imul rdi, rbx +CryptonightR_instruction19: + add rdi, rbx + add rdi, 2147483647 +CryptonightR_instruction20: + sub rdi, rbx +CryptonightR_instruction21: + ror 
edi, cl +CryptonightR_instruction22: + rol edi, cl +CryptonightR_instruction23: + xor rdi, rbx +CryptonightR_instruction24: + imul rbp, rbx +CryptonightR_instruction25: + imul rbp, rbx +CryptonightR_instruction26: + imul rbp, rbx +CryptonightR_instruction27: + add rbp, rbx + add rbp, 2147483647 +CryptonightR_instruction28: + sub rbp, rbx +CryptonightR_instruction29: + ror ebp, cl +CryptonightR_instruction30: + rol ebp, cl +CryptonightR_instruction31: + xor rbp, rbx +CryptonightR_instruction32: + imul rbx, rsi +CryptonightR_instruction33: + imul rbx, rsi +CryptonightR_instruction34: + imul rbx, rsi +CryptonightR_instruction35: + add rbx, rsi + add rbx, 2147483647 +CryptonightR_instruction36: + sub rbx, rsi +CryptonightR_instruction37: + ror ebx, cl +CryptonightR_instruction38: + rol ebx, cl +CryptonightR_instruction39: + xor rbx, rsi +CryptonightR_instruction40: + imul rsi, rsi +CryptonightR_instruction41: + imul rsi, rsi +CryptonightR_instruction42: + imul rsi, rsi +CryptonightR_instruction43: + add rsi, r9 + add rsi, 2147483647 +CryptonightR_instruction44: + sub rsi, r9 +CryptonightR_instruction45: + ror esi, cl +CryptonightR_instruction46: + rol esi, cl +CryptonightR_instruction47: + xor rsi, r9 +CryptonightR_instruction48: + imul rdi, rsi +CryptonightR_instruction49: + imul rdi, rsi +CryptonightR_instruction50: + imul rdi, rsi +CryptonightR_instruction51: + add rdi, rsi + add rdi, 2147483647 +CryptonightR_instruction52: + sub rdi, rsi +CryptonightR_instruction53: + ror edi, cl +CryptonightR_instruction54: + rol edi, cl +CryptonightR_instruction55: + xor rdi, rsi +CryptonightR_instruction56: + imul rbp, rsi +CryptonightR_instruction57: + imul rbp, rsi +CryptonightR_instruction58: + imul rbp, rsi +CryptonightR_instruction59: + add rbp, rsi + add rbp, 2147483647 +CryptonightR_instruction60: + sub rbp, rsi +CryptonightR_instruction61: + ror ebp, cl +CryptonightR_instruction62: + rol ebp, cl +CryptonightR_instruction63: + xor rbp, rsi +CryptonightR_instruction64: + imul rbx, rdi +CryptonightR_instruction65: + imul rbx, rdi +CryptonightR_instruction66: + imul rbx, rdi +CryptonightR_instruction67: + add rbx, rdi + add rbx, 2147483647 +CryptonightR_instruction68: + sub rbx, rdi +CryptonightR_instruction69: + ror ebx, cl +CryptonightR_instruction70: + rol ebx, cl +CryptonightR_instruction71: + xor rbx, rdi +CryptonightR_instruction72: + imul rsi, rdi +CryptonightR_instruction73: + imul rsi, rdi +CryptonightR_instruction74: + imul rsi, rdi +CryptonightR_instruction75: + add rsi, rdi + add rsi, 2147483647 +CryptonightR_instruction76: + sub rsi, rdi +CryptonightR_instruction77: + ror esi, cl +CryptonightR_instruction78: + rol esi, cl +CryptonightR_instruction79: + xor rsi, rdi +CryptonightR_instruction80: + imul rdi, rdi +CryptonightR_instruction81: + imul rdi, rdi +CryptonightR_instruction82: + imul rdi, rdi +CryptonightR_instruction83: + add rdi, r9 + add rdi, 2147483647 +CryptonightR_instruction84: + sub rdi, r9 +CryptonightR_instruction85: + ror edi, cl +CryptonightR_instruction86: + rol edi, cl +CryptonightR_instruction87: + xor rdi, r9 +CryptonightR_instruction88: + imul rbp, rdi +CryptonightR_instruction89: + imul rbp, rdi +CryptonightR_instruction90: + imul rbp, rdi +CryptonightR_instruction91: + add rbp, rdi + add rbp, 2147483647 +CryptonightR_instruction92: + sub rbp, rdi +CryptonightR_instruction93: + ror ebp, cl +CryptonightR_instruction94: + rol ebp, cl +CryptonightR_instruction95: + xor rbp, rdi +CryptonightR_instruction96: + imul rbx, rbp +CryptonightR_instruction97: + imul rbx, rbp 
+CryptonightR_instruction98: + imul rbx, rbp +CryptonightR_instruction99: + add rbx, rbp + add rbx, 2147483647 +CryptonightR_instruction100: + sub rbx, rbp +CryptonightR_instruction101: + ror ebx, cl +CryptonightR_instruction102: + rol ebx, cl +CryptonightR_instruction103: + xor rbx, rbp +CryptonightR_instruction104: + imul rsi, rbp +CryptonightR_instruction105: + imul rsi, rbp +CryptonightR_instruction106: + imul rsi, rbp +CryptonightR_instruction107: + add rsi, rbp + add rsi, 2147483647 +CryptonightR_instruction108: + sub rsi, rbp +CryptonightR_instruction109: + ror esi, cl +CryptonightR_instruction110: + rol esi, cl +CryptonightR_instruction111: + xor rsi, rbp +CryptonightR_instruction112: + imul rdi, rbp +CryptonightR_instruction113: + imul rdi, rbp +CryptonightR_instruction114: + imul rdi, rbp +CryptonightR_instruction115: + add rdi, rbp + add rdi, 2147483647 +CryptonightR_instruction116: + sub rdi, rbp +CryptonightR_instruction117: + ror edi, cl +CryptonightR_instruction118: + rol edi, cl +CryptonightR_instruction119: + xor rdi, rbp +CryptonightR_instruction120: + imul rbp, rbp +CryptonightR_instruction121: + imul rbp, rbp +CryptonightR_instruction122: + imul rbp, rbp +CryptonightR_instruction123: + add rbp, r9 + add rbp, 2147483647 +CryptonightR_instruction124: + sub rbp, r9 +CryptonightR_instruction125: + ror ebp, cl +CryptonightR_instruction126: + rol ebp, cl +CryptonightR_instruction127: + xor rbp, r9 +CryptonightR_instruction128: + imul rbx, rsp +CryptonightR_instruction129: + imul rbx, rsp +CryptonightR_instruction130: + imul rbx, rsp +CryptonightR_instruction131: + add rbx, rsp + add rbx, 2147483647 +CryptonightR_instruction132: + sub rbx, rsp +CryptonightR_instruction133: + ror ebx, cl +CryptonightR_instruction134: + rol ebx, cl +CryptonightR_instruction135: + xor rbx, rsp +CryptonightR_instruction136: + imul rsi, rsp +CryptonightR_instruction137: + imul rsi, rsp +CryptonightR_instruction138: + imul rsi, rsp +CryptonightR_instruction139: + add rsi, rsp + add rsi, 2147483647 +CryptonightR_instruction140: + sub rsi, rsp +CryptonightR_instruction141: + ror esi, cl +CryptonightR_instruction142: + rol esi, cl +CryptonightR_instruction143: + xor rsi, rsp +CryptonightR_instruction144: + imul rdi, rsp +CryptonightR_instruction145: + imul rdi, rsp +CryptonightR_instruction146: + imul rdi, rsp +CryptonightR_instruction147: + add rdi, rsp + add rdi, 2147483647 +CryptonightR_instruction148: + sub rdi, rsp +CryptonightR_instruction149: + ror edi, cl +CryptonightR_instruction150: + rol edi, cl +CryptonightR_instruction151: + xor rdi, rsp +CryptonightR_instruction152: + imul rbp, rsp +CryptonightR_instruction153: + imul rbp, rsp +CryptonightR_instruction154: + imul rbp, rsp +CryptonightR_instruction155: + add rbp, rsp + add rbp, 2147483647 +CryptonightR_instruction156: + sub rbp, rsp +CryptonightR_instruction157: + ror ebp, cl +CryptonightR_instruction158: + rol ebp, cl +CryptonightR_instruction159: + xor rbp, rsp +CryptonightR_instruction160: + imul rbx, r15 +CryptonightR_instruction161: + imul rbx, r15 +CryptonightR_instruction162: + imul rbx, r15 +CryptonightR_instruction163: + add rbx, r15 + add rbx, 2147483647 +CryptonightR_instruction164: + sub rbx, r15 +CryptonightR_instruction165: + ror ebx, cl +CryptonightR_instruction166: + rol ebx, cl +CryptonightR_instruction167: + xor rbx, r15 +CryptonightR_instruction168: + imul rsi, r15 +CryptonightR_instruction169: + imul rsi, r15 +CryptonightR_instruction170: + imul rsi, r15 +CryptonightR_instruction171: + add rsi, r15 + add rsi, 2147483647 
+CryptonightR_instruction172: + sub rsi, r15 +CryptonightR_instruction173: + ror esi, cl +CryptonightR_instruction174: + rol esi, cl +CryptonightR_instruction175: + xor rsi, r15 +CryptonightR_instruction176: + imul rdi, r15 +CryptonightR_instruction177: + imul rdi, r15 +CryptonightR_instruction178: + imul rdi, r15 +CryptonightR_instruction179: + add rdi, r15 + add rdi, 2147483647 +CryptonightR_instruction180: + sub rdi, r15 +CryptonightR_instruction181: + ror edi, cl +CryptonightR_instruction182: + rol edi, cl +CryptonightR_instruction183: + xor rdi, r15 +CryptonightR_instruction184: + imul rbp, r15 +CryptonightR_instruction185: + imul rbp, r15 +CryptonightR_instruction186: + imul rbp, r15 +CryptonightR_instruction187: + add rbp, r15 + add rbp, 2147483647 +CryptonightR_instruction188: + sub rbp, r15 +CryptonightR_instruction189: + ror ebp, cl +CryptonightR_instruction190: + rol ebp, cl +CryptonightR_instruction191: + xor rbp, r15 +CryptonightR_instruction192: + imul rbx, rax +CryptonightR_instruction193: + imul rbx, rax +CryptonightR_instruction194: + imul rbx, rax +CryptonightR_instruction195: + add rbx, rax + add rbx, 2147483647 +CryptonightR_instruction196: + sub rbx, rax +CryptonightR_instruction197: + ror ebx, cl +CryptonightR_instruction198: + rol ebx, cl +CryptonightR_instruction199: + xor rbx, rax +CryptonightR_instruction200: + imul rsi, rax +CryptonightR_instruction201: + imul rsi, rax +CryptonightR_instruction202: + imul rsi, rax +CryptonightR_instruction203: + add rsi, rax + add rsi, 2147483647 +CryptonightR_instruction204: + sub rsi, rax +CryptonightR_instruction205: + ror esi, cl +CryptonightR_instruction206: + rol esi, cl +CryptonightR_instruction207: + xor rsi, rax +CryptonightR_instruction208: + imul rdi, rax +CryptonightR_instruction209: + imul rdi, rax +CryptonightR_instruction210: + imul rdi, rax +CryptonightR_instruction211: + add rdi, rax + add rdi, 2147483647 +CryptonightR_instruction212: + sub rdi, rax +CryptonightR_instruction213: + ror edi, cl +CryptonightR_instruction214: + rol edi, cl +CryptonightR_instruction215: + xor rdi, rax +CryptonightR_instruction216: + imul rbp, rax +CryptonightR_instruction217: + imul rbp, rax +CryptonightR_instruction218: + imul rbp, rax +CryptonightR_instruction219: + add rbp, rax + add rbp, 2147483647 +CryptonightR_instruction220: + sub rbp, rax +CryptonightR_instruction221: + ror ebp, cl +CryptonightR_instruction222: + rol ebp, cl +CryptonightR_instruction223: + xor rbp, rax +CryptonightR_instruction224: + imul rbx, rdx +CryptonightR_instruction225: + imul rbx, rdx +CryptonightR_instruction226: + imul rbx, rdx +CryptonightR_instruction227: + add rbx, rdx + add rbx, 2147483647 +CryptonightR_instruction228: + sub rbx, rdx +CryptonightR_instruction229: + ror ebx, cl +CryptonightR_instruction230: + rol ebx, cl +CryptonightR_instruction231: + xor rbx, rdx +CryptonightR_instruction232: + imul rsi, rdx +CryptonightR_instruction233: + imul rsi, rdx +CryptonightR_instruction234: + imul rsi, rdx +CryptonightR_instruction235: + add rsi, rdx + add rsi, 2147483647 +CryptonightR_instruction236: + sub rsi, rdx +CryptonightR_instruction237: + ror esi, cl +CryptonightR_instruction238: + rol esi, cl +CryptonightR_instruction239: + xor rsi, rdx +CryptonightR_instruction240: + imul rdi, rdx +CryptonightR_instruction241: + imul rdi, rdx +CryptonightR_instruction242: + imul rdi, rdx +CryptonightR_instruction243: + add rdi, rdx + add rdi, 2147483647 +CryptonightR_instruction244: + sub rdi, rdx +CryptonightR_instruction245: + ror edi, cl 
+CryptonightR_instruction246: + rol edi, cl +CryptonightR_instruction247: + xor rdi, rdx +CryptonightR_instruction248: + imul rbp, rdx +CryptonightR_instruction249: + imul rbp, rdx +CryptonightR_instruction250: + imul rbp, rdx +CryptonightR_instruction251: + add rbp, rdx + add rbp, 2147483647 +CryptonightR_instruction252: + sub rbp, rdx +CryptonightR_instruction253: + ror ebp, cl +CryptonightR_instruction254: + rol ebp, cl +CryptonightR_instruction255: + xor rbp, rdx +CryptonightR_instruction256: + imul rbx, rbx +CryptonightR_instruction_mov0: + +CryptonightR_instruction_mov1: + +CryptonightR_instruction_mov2: + +CryptonightR_instruction_mov3: + +CryptonightR_instruction_mov4: + +CryptonightR_instruction_mov5: + mov rcx, rbx +CryptonightR_instruction_mov6: + mov rcx, rbx +CryptonightR_instruction_mov7: + +CryptonightR_instruction_mov8: + +CryptonightR_instruction_mov9: + +CryptonightR_instruction_mov10: + +CryptonightR_instruction_mov11: + +CryptonightR_instruction_mov12: + +CryptonightR_instruction_mov13: + mov rcx, rbx +CryptonightR_instruction_mov14: + mov rcx, rbx +CryptonightR_instruction_mov15: + +CryptonightR_instruction_mov16: + +CryptonightR_instruction_mov17: + +CryptonightR_instruction_mov18: + +CryptonightR_instruction_mov19: + +CryptonightR_instruction_mov20: + +CryptonightR_instruction_mov21: + mov rcx, rbx +CryptonightR_instruction_mov22: + mov rcx, rbx +CryptonightR_instruction_mov23: + +CryptonightR_instruction_mov24: + +CryptonightR_instruction_mov25: + +CryptonightR_instruction_mov26: + +CryptonightR_instruction_mov27: + +CryptonightR_instruction_mov28: + +CryptonightR_instruction_mov29: + mov rcx, rbx +CryptonightR_instruction_mov30: + mov rcx, rbx +CryptonightR_instruction_mov31: + +CryptonightR_instruction_mov32: + +CryptonightR_instruction_mov33: + +CryptonightR_instruction_mov34: + +CryptonightR_instruction_mov35: + +CryptonightR_instruction_mov36: + +CryptonightR_instruction_mov37: + mov rcx, rsi +CryptonightR_instruction_mov38: + mov rcx, rsi +CryptonightR_instruction_mov39: + +CryptonightR_instruction_mov40: + +CryptonightR_instruction_mov41: + +CryptonightR_instruction_mov42: + +CryptonightR_instruction_mov43: + +CryptonightR_instruction_mov44: + +CryptonightR_instruction_mov45: + mov rcx, rsi +CryptonightR_instruction_mov46: + mov rcx, rsi +CryptonightR_instruction_mov47: + +CryptonightR_instruction_mov48: + +CryptonightR_instruction_mov49: + +CryptonightR_instruction_mov50: + +CryptonightR_instruction_mov51: + +CryptonightR_instruction_mov52: + +CryptonightR_instruction_mov53: + mov rcx, rsi +CryptonightR_instruction_mov54: + mov rcx, rsi +CryptonightR_instruction_mov55: + +CryptonightR_instruction_mov56: + +CryptonightR_instruction_mov57: + +CryptonightR_instruction_mov58: + +CryptonightR_instruction_mov59: + +CryptonightR_instruction_mov60: + +CryptonightR_instruction_mov61: + mov rcx, rsi +CryptonightR_instruction_mov62: + mov rcx, rsi +CryptonightR_instruction_mov63: + +CryptonightR_instruction_mov64: + +CryptonightR_instruction_mov65: + +CryptonightR_instruction_mov66: + +CryptonightR_instruction_mov67: + +CryptonightR_instruction_mov68: + +CryptonightR_instruction_mov69: + mov rcx, rdi +CryptonightR_instruction_mov70: + mov rcx, rdi +CryptonightR_instruction_mov71: + +CryptonightR_instruction_mov72: + +CryptonightR_instruction_mov73: + +CryptonightR_instruction_mov74: + +CryptonightR_instruction_mov75: + +CryptonightR_instruction_mov76: + +CryptonightR_instruction_mov77: + mov rcx, rdi +CryptonightR_instruction_mov78: + mov rcx, rdi 
+CryptonightR_instruction_mov79: + +CryptonightR_instruction_mov80: + +CryptonightR_instruction_mov81: + +CryptonightR_instruction_mov82: + +CryptonightR_instruction_mov83: + +CryptonightR_instruction_mov84: + +CryptonightR_instruction_mov85: + mov rcx, rdi +CryptonightR_instruction_mov86: + mov rcx, rdi +CryptonightR_instruction_mov87: + +CryptonightR_instruction_mov88: + +CryptonightR_instruction_mov89: + +CryptonightR_instruction_mov90: + +CryptonightR_instruction_mov91: + +CryptonightR_instruction_mov92: + +CryptonightR_instruction_mov93: + mov rcx, rdi +CryptonightR_instruction_mov94: + mov rcx, rdi +CryptonightR_instruction_mov95: + +CryptonightR_instruction_mov96: + +CryptonightR_instruction_mov97: + +CryptonightR_instruction_mov98: + +CryptonightR_instruction_mov99: + +CryptonightR_instruction_mov100: + +CryptonightR_instruction_mov101: + mov rcx, rbp +CryptonightR_instruction_mov102: + mov rcx, rbp +CryptonightR_instruction_mov103: + +CryptonightR_instruction_mov104: + +CryptonightR_instruction_mov105: + +CryptonightR_instruction_mov106: + +CryptonightR_instruction_mov107: + +CryptonightR_instruction_mov108: + +CryptonightR_instruction_mov109: + mov rcx, rbp +CryptonightR_instruction_mov110: + mov rcx, rbp +CryptonightR_instruction_mov111: + +CryptonightR_instruction_mov112: + +CryptonightR_instruction_mov113: + +CryptonightR_instruction_mov114: + +CryptonightR_instruction_mov115: + +CryptonightR_instruction_mov116: + +CryptonightR_instruction_mov117: + mov rcx, rbp +CryptonightR_instruction_mov118: + mov rcx, rbp +CryptonightR_instruction_mov119: + +CryptonightR_instruction_mov120: + +CryptonightR_instruction_mov121: + +CryptonightR_instruction_mov122: + +CryptonightR_instruction_mov123: + +CryptonightR_instruction_mov124: + +CryptonightR_instruction_mov125: + mov rcx, rbp +CryptonightR_instruction_mov126: + mov rcx, rbp +CryptonightR_instruction_mov127: + +CryptonightR_instruction_mov128: + +CryptonightR_instruction_mov129: + +CryptonightR_instruction_mov130: + +CryptonightR_instruction_mov131: + +CryptonightR_instruction_mov132: + +CryptonightR_instruction_mov133: + mov rcx, rsp +CryptonightR_instruction_mov134: + mov rcx, rsp +CryptonightR_instruction_mov135: + +CryptonightR_instruction_mov136: + +CryptonightR_instruction_mov137: + +CryptonightR_instruction_mov138: + +CryptonightR_instruction_mov139: + +CryptonightR_instruction_mov140: + +CryptonightR_instruction_mov141: + mov rcx, rsp +CryptonightR_instruction_mov142: + mov rcx, rsp +CryptonightR_instruction_mov143: + +CryptonightR_instruction_mov144: + +CryptonightR_instruction_mov145: + +CryptonightR_instruction_mov146: + +CryptonightR_instruction_mov147: + +CryptonightR_instruction_mov148: + +CryptonightR_instruction_mov149: + mov rcx, rsp +CryptonightR_instruction_mov150: + mov rcx, rsp +CryptonightR_instruction_mov151: + +CryptonightR_instruction_mov152: + +CryptonightR_instruction_mov153: + +CryptonightR_instruction_mov154: + +CryptonightR_instruction_mov155: + +CryptonightR_instruction_mov156: + +CryptonightR_instruction_mov157: + mov rcx, rsp +CryptonightR_instruction_mov158: + mov rcx, rsp +CryptonightR_instruction_mov159: + +CryptonightR_instruction_mov160: + +CryptonightR_instruction_mov161: + +CryptonightR_instruction_mov162: + +CryptonightR_instruction_mov163: + +CryptonightR_instruction_mov164: + +CryptonightR_instruction_mov165: + mov rcx, r15 +CryptonightR_instruction_mov166: + mov rcx, r15 +CryptonightR_instruction_mov167: + +CryptonightR_instruction_mov168: + +CryptonightR_instruction_mov169: + 
+CryptonightR_instruction_mov170:
+
+CryptonightR_instruction_mov171:
+
+CryptonightR_instruction_mov172:
+
+CryptonightR_instruction_mov173:
+ mov rcx, r15
+CryptonightR_instruction_mov174:
+ mov rcx, r15
+CryptonightR_instruction_mov175:
+
+CryptonightR_instruction_mov176:
+
+CryptonightR_instruction_mov177:
+
+CryptonightR_instruction_mov178:
+
+CryptonightR_instruction_mov179:
+
+CryptonightR_instruction_mov180:
+
+CryptonightR_instruction_mov181:
+ mov rcx, r15
+CryptonightR_instruction_mov182:
+ mov rcx, r15
+CryptonightR_instruction_mov183:
+
+CryptonightR_instruction_mov184:
+
+CryptonightR_instruction_mov185:
+
+CryptonightR_instruction_mov186:
+
+CryptonightR_instruction_mov187:
+
+CryptonightR_instruction_mov188:
+
+CryptonightR_instruction_mov189:
+ mov rcx, r15
+CryptonightR_instruction_mov190:
+ mov rcx, r15
+CryptonightR_instruction_mov191:
+
+CryptonightR_instruction_mov192:
+
+CryptonightR_instruction_mov193:
+
+CryptonightR_instruction_mov194:
+
+CryptonightR_instruction_mov195:
+
+CryptonightR_instruction_mov196:
+
+CryptonightR_instruction_mov197:
+ mov rcx, rax
+CryptonightR_instruction_mov198:
+ mov rcx, rax
+CryptonightR_instruction_mov199:
+
+CryptonightR_instruction_mov200:
+
+CryptonightR_instruction_mov201:
+
+CryptonightR_instruction_mov202:
+
+CryptonightR_instruction_mov203:
+
+CryptonightR_instruction_mov204:
+
+CryptonightR_instruction_mov205:
+ mov rcx, rax
+CryptonightR_instruction_mov206:
+ mov rcx, rax
+CryptonightR_instruction_mov207:
+
+CryptonightR_instruction_mov208:
+
+CryptonightR_instruction_mov209:
+
+CryptonightR_instruction_mov210:
+
+CryptonightR_instruction_mov211:
+
+CryptonightR_instruction_mov212:
+
+CryptonightR_instruction_mov213:
+ mov rcx, rax
+CryptonightR_instruction_mov214:
+ mov rcx, rax
+CryptonightR_instruction_mov215:
+
+CryptonightR_instruction_mov216:
+
+CryptonightR_instruction_mov217:
+
+CryptonightR_instruction_mov218:
+
+CryptonightR_instruction_mov219:
+
+CryptonightR_instruction_mov220:
+
+CryptonightR_instruction_mov221:
+ mov rcx, rax
+CryptonightR_instruction_mov222:
+ mov rcx, rax
+CryptonightR_instruction_mov223:
+
+CryptonightR_instruction_mov224:
+
+CryptonightR_instruction_mov225:
+
+CryptonightR_instruction_mov226:
+
+CryptonightR_instruction_mov227:
+
+CryptonightR_instruction_mov228:
+
+CryptonightR_instruction_mov229:
+ mov rcx, rdx
+CryptonightR_instruction_mov230:
+ mov rcx, rdx
+CryptonightR_instruction_mov231:
+
+CryptonightR_instruction_mov232:
+
+CryptonightR_instruction_mov233:
+
+CryptonightR_instruction_mov234:
+
+CryptonightR_instruction_mov235:
+
+CryptonightR_instruction_mov236:
+
+CryptonightR_instruction_mov237:
+ mov rcx, rdx
+CryptonightR_instruction_mov238:
+ mov rcx, rdx
+CryptonightR_instruction_mov239:
+
+CryptonightR_instruction_mov240:
+
+CryptonightR_instruction_mov241:
+
+CryptonightR_instruction_mov242:
+
+CryptonightR_instruction_mov243:
+
+CryptonightR_instruction_mov244:
+
+CryptonightR_instruction_mov245:
+ mov rcx, rdx
+CryptonightR_instruction_mov246:
+ mov rcx, rdx
+CryptonightR_instruction_mov247:
+
+CryptonightR_instruction_mov248:
+
+CryptonightR_instruction_mov249:
+
+CryptonightR_instruction_mov250:
+
+CryptonightR_instruction_mov251:
+
+CryptonightR_instruction_mov252:
+
+CryptonightR_instruction_mov253:
+ mov rcx, rdx
+CryptonightR_instruction_mov254:
+ mov rcx, rdx
+CryptonightR_instruction_mov255:
+
+CryptonightR_instruction_mov256:
+
+_TEXT_CN_TEMPLATE ENDS
+END
diff --git a/src/crypto/asm/win/CryptonightR_template.inc b/src/crypto/asm/win/CryptonightR_template.inc
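The 257 CryptonightR_instruction* snippets above form the CNv4 random-math lookup table: eight weighted opcode slots per register pair (IMUL fills three slots, so multiplication is drawn 3/8 of the time, followed by ADD with an embedded constant, SUB, ROR, ROL, XOR) for each of the 32 destination/source combinations, with a trailing imul so that every real snippet has a successor label; where source and destination coincide, r9 is substituted for ADD/SUB/XOR, as slots 3, 43, 83 and 123 show. The parallel CryptonightR_instruction_mov* slots hold the optional mov rcx, <src> prologue, non-empty only for ROR/ROL, which need the rotate count in cl. The generator in src/crypto/CryptoNightR_gen.cpp splices these snippets by address into a JIT buffer; the C++ below is a minimal sketch of that splicing under stated assumptions: the table initializers and the emit_snippet helper are illustrative names, and only the extern labels come from the file above.

    // Sketch: copy a pre-assembled snippet into a JIT buffer, measuring its
    // length as the distance between consecutive labels. Illustrative only;
    // links against the assembled templates above.
    #include <cstdint>
    #include <cstring>

    extern "C" {
        void CryptonightR_instruction0();
        void CryptonightR_instruction1();
        void CryptonightR_instruction_mov0();
        void CryptonightR_instruction_mov1();
        // ...continues through index 256, matching the PUBLIC list above
    }

    // Hypothetical label tables; the real generator fills all 257 entries.
    static void* const instructions[] = {
        (void*) CryptonightR_instruction0,
        (void*) CryptonightR_instruction1,
        // ...
    };
    static void* const instructions_mov[] = {
        (void*) CryptonightR_instruction_mov0,
        (void*) CryptonightR_instruction_mov1,
        // ...
    };

    // Append snippet i from a label table to the buffer at p.
    static uint8_t* emit_snippet(uint8_t* p, void* const table[], size_t i) {
        const uint8_t* begin = (const uint8_t*) table[i];
        const uint8_t* end   = (const uint8_t*) table[i + 1];
        memcpy(p, begin, (size_t)(end - begin));
        return p + (end - begin);
    }

Because a snippet's length is simply the distance to the next label, even the empty mov slots keep their own label; dropping a single entry would shift every measured length after it.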
new file mode 100644 index 00000000..1dae434a --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_template.inc @@ -0,0 +1,529 @@ +PUBLIC FN_PREFIX(CryptonightR_template_part1) +PUBLIC FN_PREFIX(CryptonightR_template_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_part2) +PUBLIC FN_PREFIX(CryptonightR_template_part3) +PUBLIC FN_PREFIX(CryptonightR_template_end) +PUBLIC FN_PREFIX(CryptonightR_template_double_part1) +PUBLIC FN_PREFIX(CryptonightR_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightR_template_double_part2) +PUBLIC FN_PREFIX(CryptonightR_template_double_part3) +PUBLIC FN_PREFIX(CryptonightR_template_double_part4) +PUBLIC FN_PREFIX(CryptonightR_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightR_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movd r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightR_template_part2): + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov rax, r13 + mul r12 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + movaps xmm3, xmm1 + movdqa xmm2, XMMWORD PTR [r9+r11] + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and 
r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightR_template_mainloop) + +FN_PREFIX(CryptonightR_template_part3): + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightR_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightR_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightR_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm1 + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movd rdx, xmm6 + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + 
mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightR_template_double_part2): + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + pxor xmm6, xmm2 + paddq xmm2, xmm3 + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightR_template_double_part3): + + movd r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + mov rdi, rcx + mov r8, rax + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 + xor ebp, 48 + paddq xmm1, xmm8 + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 
+ mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightR_template_double_mainloop) + +FN_PREFIX(CryptonightR_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightR_template_double_end): diff --git a/src/crypto/asm/win/CryptonightR_template_win.inc b/src/crypto/asm/win/CryptonightR_template_win.inc new file mode 100644 index 00000000..2f2d71a2 --- /dev/null +++ b/src/crypto/asm/win/CryptonightR_template_win.inc @@ -0,0 +1,529 @@ +PUBLIC CryptonightR_template_part1 +PUBLIC CryptonightR_template_mainloop +PUBLIC CryptonightR_template_part2 +PUBLIC CryptonightR_template_part3 +PUBLIC CryptonightR_template_end +PUBLIC CryptonightR_template_double_part1 +PUBLIC CryptonightR_template_double_mainloop +PUBLIC CryptonightR_template_double_part2 +PUBLIC CryptonightR_template_double_part3 +PUBLIC CryptonightR_template_double_part4 +PUBLIC CryptonightR_template_double_end + +ALIGN(64) +CryptonightR_template_part1: + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +CryptonightR_template_mainloop: + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movaps xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + pxor xmm0, xmm2 + pxor xmm5, xmm1 + pxor xmm5, xmm0 + paddq xmm3, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movd r12, xmm5 + movd r10d, xmm5 + and r10d, 2097136 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +CryptonightR_template_part2: 
+ mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor rsp, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov rax, r13 + mul r12 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + movaps xmm3, xmm1 + movdqa xmm2, XMMWORD PTR [r9+r11] + movdqa xmm0, XMMWORD PTR [r10+r11] + pxor xmm1, xmm2 + pxor xmm5, xmm0 + pxor xmm5, xmm1 + paddq xmm3, xmm4 + paddq xmm2, xmm6 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqu XMMWORD PTR [r12+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm3 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz CryptonightR_template_mainloop + +CryptonightR_template_part3: + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +CryptonightR_template_end: + +ALIGN(64) +CryptonightR_template_double_part1: + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +CryptonightR_template_double_mainloop: + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + pxor xmm6, 
xmm1 + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + pxor xmm6, xmm0 + movd rdx, xmm6 + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm1 + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +CryptonightR_template_double_part2: + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r14, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r12, rax + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movdqu xmm1, XMMWORD PTR [rcx+rsi] + pxor xmm6, xmm1 + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + pxor xmm6, xmm2 + paddq xmm2, xmm3 + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + pxor xmm6, xmm0 + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +CryptonightR_template_double_part3: + + movd r15, xmm13 + + mov eax, edi + mov edx, ebp + shl rdx, 32 + or rax, rdx + xor r15, rax + + mov eax, ebx + mov edx, esi + shl rdx, 32 + or rax, rdx + xor r13, rax + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR 
[rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + mov rdi, rcx + mov r8, rax + movdqu xmm1, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm1 + xor ebp, 48 + paddq xmm1, xmm8 + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm2 + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + pxor xmm5, xmm0 + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz CryptonightR_template_double_mainloop + +CryptonightR_template_double_part4: + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +CryptonightR_template_double_end: diff --git a/src/crypto/asm/win/CryptonightWOW_soft_aes_template.inc b/src/crypto/asm/win/CryptonightWOW_soft_aes_template.inc new file mode 100644 index 00000000..cc273781 --- /dev/null +++ b/src/crypto/asm/win/CryptonightWOW_soft_aes_template.inc @@ -0,0 +1,266 @@ +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part2) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_soft_aes_template_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_soft_aes_template_part1): + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movd xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movd xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movd xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movd xmm10, QWORD PTR [r10+96] + movd xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movd xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movd xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop): + movd xmm11, r12d + mov r12, QWORD 
PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movd xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movd xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movd r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movd rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movd rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + +FN_PREFIX(CryptonightWOW_soft_aes_template_part2): + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movd xmm0, rax + movd xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov 
QWORD PTR [rsp+328], rax + sub r12d, 1 + jne FN_PREFIX(CryptonightWOW_soft_aes_template_mainloop) + +FN_PREFIX(CryptonightWOW_soft_aes_template_part3): + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +FN_PREFIX(CryptonightWOW_soft_aes_template_end): diff --git a/src/crypto/asm/win/CryptonightWOW_soft_aes_template_win.inc b/src/crypto/asm/win/CryptonightWOW_soft_aes_template_win.inc new file mode 100644 index 00000000..68209036 --- /dev/null +++ b/src/crypto/asm/win/CryptonightWOW_soft_aes_template_win.inc @@ -0,0 +1,266 @@ +PUBLIC CryptonightWOW_soft_aes_template_part1 +PUBLIC CryptonightWOW_soft_aes_template_mainloop +PUBLIC CryptonightWOW_soft_aes_template_part2 +PUBLIC CryptonightWOW_soft_aes_template_part3 +PUBLIC CryptonightWOW_soft_aes_template_end + +ALIGN(64) +CryptonightWOW_soft_aes_template_part1: + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 232 + + mov eax, [rcx+96] + mov ebx, [rcx+100] + mov esi, [rcx+104] + mov edx, [rcx+108] + mov [rsp+144], eax + mov [rsp+148], ebx + mov [rsp+152], esi + mov [rsp+156], edx + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movd xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movd xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movd xmm5, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movd xmm10, QWORD PTR [r10+96] + movd xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+328], rax + movd xmm12, r11 + mov QWORD PTR [rsp+320], r9 + punpcklqdq xmm5, xmm0 + movd xmm13, rcx + mov r12d, 524288 + + ALIGN(64) +CryptonightWOW_soft_aes_template_mainloop: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movd xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movd xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+328] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD 
PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movd r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movd rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movd rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [r13], xmm0 + + mov ebx, [rsp+144] + mov ebp, [rsp+152] + add ebx, [rsp+148] + add ebp, [rsp+156] + shl rbp, 32 + or rbx, rbp + + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + + mov [rsp+160], rbx + mov [rsp+168], rdi + mov [rsp+176], rbp + mov [rsp+184], r10 + mov r10, rsp + + mov ebx, [rsp+144] + mov esi, [rsp+148] + mov edi, [rsp+152] + mov ebp, [rsp+156] + + movd esp, xmm7 + movaps xmm0, xmm7 + psrldq xmm0, 8 + movd r15d, xmm0 + movd eax, xmm4 + movd edx, xmm5 + +CryptonightWOW_soft_aes_template_part2: + mov rsp, r10 + mov [rsp+144], ebx + mov [rsp+148], esi + mov [rsp+152], edi + mov [rsp+156], ebp + + mov rbx, [rsp+160] + mov rdi, [rsp+168] + mov rbp, [rsp+176] + mov r10, [rsp+184] + + mov r9, r10 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movd xmm0, rax + movd xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+320] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+304] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+320], r9 + mov QWORD PTR [rsp+328], rax + sub r12d, 1 + jne CryptonightWOW_soft_aes_template_mainloop + +CryptonightWOW_soft_aes_template_part3: + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 232 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + ret +CryptonightWOW_soft_aes_template_end: diff --git a/src/crypto/asm/win/CryptonightWOW_template.inc b/src/crypto/asm/win/CryptonightWOW_template.inc new file mode 100644 index 00000000..47fbc94f --- /dev/null +++ b/src/crypto/asm/win/CryptonightWOW_template.inc @@ -0,0 +1,486 @@ +PUBLIC FN_PREFIX(CryptonightWOW_template_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_part2) 
+PUBLIC FN_PREFIX(CryptonightWOW_template_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_end) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part1) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_mainloop) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part2) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part3) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_part4) +PUBLIC FN_PREFIX(CryptonightWOW_template_double_end) + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_part1): + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_mainloop): + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movd r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +FN_PREFIX(CryptonightWOW_template_part2): + mov rax, r13 + mul r12 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz FN_PREFIX(CryptonightWOW_template_mainloop) + +FN_PREFIX(CryptonightWOW_template_part3): + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + 
pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +FN_PREFIX(CryptonightWOW_template_end): + +ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_part1): + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +FN_PREFIX(CryptonightWOW_template_double_mainloop): + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd rdx, xmm6 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], 
rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +FN_PREFIX(CryptonightWOW_template_double_part2): + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movd xmm1, rdx + movd xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +FN_PREFIX(CryptonightWOW_template_double_part3): + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + movd r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movd xmm1, rdx + movd xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz FN_PREFIX(CryptonightWOW_template_double_mainloop) + +FN_PREFIX(CryptonightWOW_template_double_part4): + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop 
r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +FN_PREFIX(CryptonightWOW_template_double_end): diff --git a/src/crypto/asm/win/CryptonightWOW_template_win.inc b/src/crypto/asm/win/CryptonightWOW_template_win.inc new file mode 100644 index 00000000..9db2cf39 --- /dev/null +++ b/src/crypto/asm/win/CryptonightWOW_template_win.inc @@ -0,0 +1,486 @@ +PUBLIC CryptonightWOW_template_part1 +PUBLIC CryptonightWOW_template_mainloop +PUBLIC CryptonightWOW_template_part2 +PUBLIC CryptonightWOW_template_part3 +PUBLIC CryptonightWOW_template_end +PUBLIC CryptonightWOW_template_double_part1 +PUBLIC CryptonightWOW_template_double_mainloop +PUBLIC CryptonightWOW_template_double_part2 +PUBLIC CryptonightWOW_template_double_part3 +PUBLIC CryptonightWOW_template_double_part4 +PUBLIC CryptonightWOW_template_double_end + +ALIGN(64) +CryptonightWOW_template_part1: + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + push rdi + sub rsp, 64 + mov r12, rcx + mov r8, QWORD PTR [r12+32] + mov rdx, r12 + xor r8, QWORD PTR [r12] + mov r15, QWORD PTR [r12+40] + mov r9, r8 + xor r15, QWORD PTR [r12+8] + mov r11, QWORD PTR [r12+224] + mov r12, QWORD PTR [r12+56] + xor r12, QWORD PTR [rdx+24] + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm0, r12 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + movaps XMMWORD PTR [rsp], xmm9 + mov r12, QWORD PTR [rdx+88] + xor r12, QWORD PTR [rdx+72] + movd xmm6, rax + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm6, xmm0 + and r9d, 2097136 + movd xmm0, r12 + movd xmm7, rax + punpcklqdq xmm7, xmm0 + mov r10d, r9d + movd xmm9, rsp + mov rsp, r8 + mov r8d, 524288 + + mov ebx, [rdx+96] + mov esi, [rdx+100] + mov edi, [rdx+104] + mov ebp, [rdx+108] + + ALIGN(64) +CryptonightWOW_template_mainloop: + movdqa xmm5, XMMWORD PTR [r9+r11] + movd xmm0, r15 + movd xmm4, rsp + punpcklqdq xmm4, xmm0 + lea rdx, QWORD PTR [r9+r11] + + aesenc xmm5, xmm4 + movd r10d, xmm5 + and r10d, 2097136 + + mov r12d, r9d + mov eax, r9d + xor r9d, 48 + xor r12d, 16 + xor eax, 32 + movdqu xmm0, XMMWORD PTR [r9+r11] + movdqu xmm2, XMMWORD PTR [r12+r11] + movdqu xmm1, XMMWORD PTR [rax+r11] + paddq xmm0, xmm7 + paddq xmm2, xmm6 + paddq xmm1, xmm4 + movdqu XMMWORD PTR [r12+r11], xmm0 + movd r12, xmm5 + movdqu XMMWORD PTR [rax+r11], xmm2 + movdqu XMMWORD PTR [r9+r11], xmm1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [rdx], xmm0 + + lea r13d, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or r13, rdx + + xor r13, QWORD PTR [r10+r11] + mov r14, QWORD PTR [r10+r11+8] + + movd eax, xmm6 + movd edx, xmm7 + pextrd r9d, xmm7, 2 + +CryptonightWOW_template_part2: + mov rax, r13 + mul r12 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + mov r9d, r10d + mov r12d, r10d + xor r9d, 16 + xor r12d, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [r12+r11] + xor rdx, QWORD PTR [r12+r11] + xor rax, QWORD PTR [r11+r12+8] + movdqa xmm2, XMMWORD PTR [r9+r11] + pxor xmm3, xmm2 + paddq xmm7, XMMWORD PTR [r10+r11] + paddq xmm1, xmm4 + paddq xmm3, xmm6 + movdqu XMMWORD PTR [r9+r11], xmm7 + movdqu XMMWORD PTR [r12+r11], xmm3 + movdqu XMMWORD PTR [r10+r11], xmm1 + + movdqa xmm7, xmm6 + add r15, rax + add rsp, rdx + xor r10, 48 + mov QWORD PTR [r10+r11], rsp + xor rsp, r13 + mov r9d, esp + mov QWORD PTR [r10+r11+8], r15 + and r9d, 2097136 + xor r15, r14 + movdqa xmm6, xmm5 + dec r8d + jnz 
CryptonightWOW_template_mainloop + +CryptonightWOW_template_part3: + movd rsp, xmm9 + + mov rbx, QWORD PTR [rsp+136] + mov rbp, QWORD PTR [rsp+144] + mov rsi, QWORD PTR [rsp+152] + movaps xmm6, XMMWORD PTR [rsp+48] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+16] + movaps xmm9, XMMWORD PTR [rsp] + add rsp, 64 + pop rdi + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + ret 0 +CryptonightWOW_template_end: + +ALIGN(64) +CryptonightWOW_template_double_part1: + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 320 + mov r14, QWORD PTR [rcx+32] + mov r8, rcx + xor r14, QWORD PTR [rcx] + mov r12, QWORD PTR [rcx+40] + mov ebx, r14d + mov rsi, QWORD PTR [rcx+224] + and ebx, 2097136 + xor r12, QWORD PTR [rcx+8] + mov rcx, QWORD PTR [rcx+56] + xor rcx, QWORD PTR [r8+24] + mov rax, QWORD PTR [r8+48] + xor rax, QWORD PTR [r8+16] + mov r15, QWORD PTR [rdx+32] + xor r15, QWORD PTR [rdx] + movd xmm0, rcx + mov rcx, QWORD PTR [r8+88] + xor rcx, QWORD PTR [r8+72] + mov r13, QWORD PTR [rdx+40] + mov rdi, QWORD PTR [rdx+224] + xor r13, QWORD PTR [rdx+8] + movaps XMMWORD PTR [rsp+160], xmm6 + movaps XMMWORD PTR [rsp+176], xmm7 + movaps XMMWORD PTR [rsp+192], xmm8 + movaps XMMWORD PTR [rsp+208], xmm9 + movaps XMMWORD PTR [rsp+224], xmm10 + movaps XMMWORD PTR [rsp+240], xmm11 + movaps XMMWORD PTR [rsp+256], xmm12 + movaps XMMWORD PTR [rsp+272], xmm13 + movaps XMMWORD PTR [rsp+288], xmm14 + movaps XMMWORD PTR [rsp+304], xmm15 + movd xmm7, rax + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + + movaps xmm1, XMMWORD PTR [rdx+96] + movaps xmm2, XMMWORD PTR [r8+96] + movaps XMMWORD PTR [rsp], xmm1 + movaps XMMWORD PTR [rsp+16], xmm2 + + mov r8d, r15d + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+56] + xor rcx, QWORD PTR [rdx+24] + movd xmm9, rax + mov QWORD PTR [rsp+128], rsi + mov rax, QWORD PTR [rdx+48] + xor rax, QWORD PTR [rdx+16] + punpcklqdq xmm9, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [rdx+88] + xor rcx, QWORD PTR [rdx+72] + movd xmm8, rax + mov QWORD PTR [rsp+136], rdi + mov rax, QWORD PTR [rdx+80] + xor rax, QWORD PTR [rdx+64] + punpcklqdq xmm8, xmm0 + and r8d, 2097136 + movd xmm0, rcx + mov r11d, 524288 + movd xmm10, rax + punpcklqdq xmm10, xmm0 + + movd xmm14, QWORD PTR [rsp+128] + movd xmm15, QWORD PTR [rsp+136] + + ALIGN(64) +CryptonightWOW_template_double_mainloop: + movdqu xmm6, XMMWORD PTR [rbx+rsi] + movd xmm0, r12 + mov ecx, ebx + movd xmm3, r14 + punpcklqdq xmm3, xmm0 + xor ebx, 16 + aesenc xmm6, xmm3 + movd rdx, xmm6 + movd xmm4, r15 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + xor ebx, 48 + paddq xmm0, xmm7 + movdqu xmm1, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm0 + paddq xmm1, xmm3 + xor ebx, 16 + mov eax, ebx + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbx+rsi] + movdqu XMMWORD PTR [rbx+rsi], xmm1 + paddq xmm0, xmm9 + movdqu XMMWORD PTR [rax+rsi], xmm0 + movdqa xmm0, xmm6 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [rcx+rsi], xmm0 + mov esi, edx + movdqu xmm5, XMMWORD PTR [r8+rdi] + and esi, 2097136 + mov ecx, r8d + movd xmm0, r13 + punpcklqdq xmm4, xmm0 + xor r8d, 16 + aesenc xmm5, xmm4 + movdqu xmm0, XMMWORD PTR [r8+rdi] + xor r8d, 48 + paddq xmm0, xmm8 + movdqu xmm1, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm0 + paddq xmm1, xmm4 + xor r8d, 16 + mov eax, r8d + xor rax, 32 + movdqu xmm0, XMMWORD PTR [r8+rdi] + movdqu XMMWORD PTR [r8+rdi], xmm1 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rdi], xmm0 + movdqa xmm0, xmm5 + pxor xmm0, xmm8 + 
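+ ; store cx ^ bx to the second hash's scratchpad line (xmm5 = aesenc output cx, xmm8 = previous round's bx)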
movdqu XMMWORD PTR [rcx+rdi], xmm0 + movd rdi, xmm5 + movd rcx, xmm14 + mov ebp, edi + mov r8, QWORD PTR [rcx+rsi] + mov r10, QWORD PTR [rcx+rsi+8] + lea r9, QWORD PTR [rcx+rsi] + xor esi, 16 + + movd xmm0, rsp + movd xmm1, rsi + movd xmm2, rdi + movd xmm11, rbp + movd xmm12, r15 + movd xmm13, rdx + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp+16] + mov esi, DWORD PTR [rsp+20] + mov edi, DWORD PTR [rsp+24] + mov ebp, DWORD PTR [rsp+28] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + xor r8, rax + + movd esp, xmm3 + pextrd r15d, xmm3, 2 + movd eax, xmm7 + movd edx, xmm9 + pextrd r9d, xmm9, 2 + +CryptonightWOW_template_double_part2: + + movd rsp, xmm0 + mov DWORD PTR [rsp+16], ebx + mov DWORD PTR [rsp+20], esi + mov DWORD PTR [rsp+24], edi + mov DWORD PTR [rsp+28], ebp + + movd rsi, xmm1 + movd rdi, xmm2 + movd rbp, xmm11 + movd r15, xmm12 + movd rdx, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rbx, r8 + mov rax, r8 + mul rdx + and ebp, 2097136 + mov r8, rax + movd xmm1, rdx + movd xmm0, r8 + punpcklqdq xmm1, xmm0 + pxor xmm1, XMMWORD PTR [rcx+rsi] + xor esi, 48 + paddq xmm1, xmm7 + movdqu xmm2, XMMWORD PTR [rsi+rcx] + xor rdx, QWORD PTR [rsi+rcx] + paddq xmm2, xmm3 + xor r8, QWORD PTR [rsi+rcx+8] + movdqu XMMWORD PTR [rsi+rcx], xmm1 + xor esi, 16 + mov eax, esi + mov rsi, rcx + movdqu xmm0, XMMWORD PTR [rax+rcx] + movdqu XMMWORD PTR [rax+rcx], xmm2 + paddq xmm0, xmm9 + add r12, r8 + xor rax, 32 + add r14, rdx + movdqa xmm9, xmm7 + movdqa xmm7, xmm6 + movdqu XMMWORD PTR [rax+rcx], xmm0 + mov QWORD PTR [r9+8], r12 + xor r12, r10 + mov QWORD PTR [r9], r14 + movd rcx, xmm15 + xor r14, rbx + mov r10d, ebp + mov ebx, r14d + xor ebp, 16 + and ebx, 2097136 + mov r8, QWORD PTR [r10+rcx] + mov r9, QWORD PTR [r10+rcx+8] + + movd xmm0, rsp + movd xmm1, rbx + movd xmm2, rsi + movd xmm11, rdi + movd xmm12, rbp + movd xmm13, r15 + mov [rsp+104], rcx + mov [rsp+112], r9 + + mov ebx, DWORD PTR [rsp] + mov esi, DWORD PTR [rsp+4] + mov edi, DWORD PTR [rsp+8] + mov ebp, DWORD PTR [rsp+12] + + lea eax, [ebx+esi] + lea edx, [edi+ebp] + shl rdx, 32 + or rax, rdx + + xor r8, rax + movd xmm3, r8 + + movd esp, xmm4 + pextrd r15d, xmm4, 2 + movd eax, xmm8 + movd edx, xmm10 + pextrd r9d, xmm10, 2 + +CryptonightWOW_template_double_part3: + + movd rsp, xmm0 + mov DWORD PTR [rsp], ebx + mov DWORD PTR [rsp+4], esi + mov DWORD PTR [rsp+8], edi + mov DWORD PTR [rsp+12], ebp + + movd rbx, xmm1 + movd rsi, xmm2 + movd rdi, xmm11 + movd rbp, xmm12 + movd r15, xmm13 + mov rcx, [rsp+104] + mov r9, [rsp+112] + + mov rax, r8 + mul rdi + movd xmm1, rdx + movd xmm0, rax + punpcklqdq xmm1, xmm0 + mov rdi, rcx + mov r8, rax + pxor xmm1, XMMWORD PTR [rbp+rcx] + xor ebp, 48 + paddq xmm1, xmm8 + xor r8, QWORD PTR [rbp+rcx+8] + xor rdx, QWORD PTR [rbp+rcx] + add r13, r8 + movdqu xmm2, XMMWORD PTR [rbp+rcx] + add r15, rdx + movdqu XMMWORD PTR [rbp+rcx], xmm1 + paddq xmm2, xmm4 + xor ebp, 16 + mov eax, ebp + xor rax, 32 + movdqu xmm0, XMMWORD PTR [rbp+rcx] + movdqu XMMWORD PTR [rbp+rcx], xmm2 + paddq xmm0, xmm10 + movdqu XMMWORD PTR [rax+rcx], xmm0 + movd rax, xmm3 + movdqa xmm10, xmm8 + mov QWORD PTR [r10+rcx], r15 + movdqa xmm8, xmm5 + xor r15, rax + mov QWORD PTR [r10+rcx+8], r13 + mov r8d, r15d + xor r13, r9 + and r8d, 2097136 + dec r11d + jnz CryptonightWOW_template_double_mainloop + +CryptonightWOW_template_double_part4: + + mov rbx, QWORD PTR [rsp+400] + movaps xmm6, XMMWORD PTR [rsp+160] + movaps xmm7, XMMWORD PTR [rsp+176] + movaps xmm8, XMMWORD PTR [rsp+192] + movaps xmm9, 
XMMWORD PTR [rsp+208] + movaps xmm10, XMMWORD PTR [rsp+224] + movaps xmm11, XMMWORD PTR [rsp+240] + movaps xmm12, XMMWORD PTR [rsp+256] + movaps xmm13, XMMWORD PTR [rsp+272] + movaps xmm14, XMMWORD PTR [rsp+288] + movaps xmm15, XMMWORD PTR [rsp+304] + add rsp, 320 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + ret 0 +CryptonightWOW_template_double_end: diff --git a/src/crypto/asm/win/cn_main_loop.asm b/src/crypto/asm/win/cn_main_loop.asm index a23addd9..e62d1124 100644 --- a/src/crypto/asm/win/cn_main_loop.asm +++ b/src/crypto/asm/win/cn_main_loop.asm @@ -21,6 +21,19 @@ PUBLIC cnv2_main_loop_ultralite_ryzen_asm PUBLIC cnv2_main_loop_ultralite_bulldozer_asm PUBLIC cnv2_double_main_loop_ultralite_sandybridge_asm +PUBLIC cnv2_main_loop_xcash_ivybridge_asm +PUBLIC cnv2_main_loop_xcash_ryzen_asm +PUBLIC cnv2_main_loop_xcash_bulldozer_asm +PUBLIC cnv2_double_main_loop_xcash_sandybridge_asm + +PUBLIC cnv2_main_loop_zelerius_ivybridge_asm +PUBLIC cnv2_main_loop_zelerius_ryzen_asm +PUBLIC cnv2_main_loop_zelerius_bulldozer_asm +PUBLIC cnv2_double_main_loop_zelerius_sandybridge_asm + +PUBLIC cnv2_main_loop_rwz_all_asm +PUBLIC cnv2_double_main_loop_rwz_all_asm + PUBLIC cnv1_main_loop_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_lite_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_fast_soft_aes_sandybridge_asm @@ -30,6 +43,8 @@ PUBLIC cnv1_main_loop_rto_soft_aes_sandybridge_asm PUBLIC cnv2_main_loop_soft_aes_sandybridge_asm PUBLIC cnv2_main_loop_fastv2_soft_aes_sandybridge_asm PUBLIC cnv2_main_loop_ultralite_soft_aes_sandybridge_asm +PUBLIC cnv2_main_loop_xcash_soft_aes_sandybridge_asm +PUBLIC cnv2_main_loop_zelerius_soft_aes_sandybridge_asm ALIGN 64 cnv1_main_loop_sandybridge_asm PROC @@ -133,6 +148,66 @@ cnv2_double_main_loop_ultralite_sandybridge_asm PROC ret 0 cnv2_double_main_loop_ultralite_sandybridge_asm ENDP +ALIGN 64 +cnv2_main_loop_xcash_ivybridge_asm PROC + INCLUDE cnv2_main_loop_xcash_ivybridge.inc + ret 0 +cnv2_main_loop_xcash_ivybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_xcash_ryzen_asm PROC + INCLUDE cnv2_main_loop_xcash_ryzen.inc + ret 0 +cnv2_main_loop_xcash_ryzen_asm ENDP + +ALIGN 64 +cnv2_main_loop_xcash_bulldozer_asm PROC + INCLUDE cnv2_main_loop_xcash_bulldozer.inc + ret 0 +cnv2_main_loop_xcash_bulldozer_asm ENDP + +ALIGN 64 +cnv2_double_main_loop_xcash_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_xcash_sandybridge.inc + ret 0 +cnv2_double_main_loop_xcash_sandybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_zelerius_ivybridge_asm PROC + INCLUDE cnv2_main_loop_zelerius_ivybridge.inc + ret 0 +cnv2_main_loop_zelerius_ivybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_zelerius_ryzen_asm PROC + INCLUDE cnv2_main_loop_zelerius_ryzen.inc + ret 0 +cnv2_main_loop_zelerius_ryzen_asm ENDP + +ALIGN 64 +cnv2_main_loop_zelerius_bulldozer_asm PROC + INCLUDE cnv2_main_loop_zelerius_bulldozer.inc + ret 0 +cnv2_main_loop_zelerius_bulldozer_asm ENDP + +ALIGN 64 +cnv2_double_main_loop_zelerius_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_zelerius_sandybridge.inc + ret 0 +cnv2_double_main_loop_zelerius_sandybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_rwz_all_asm PROC + INCLUDE cnv2_main_loop_rwz_all.inc + ret 0 +cnv2_main_loop_rwz_all_asm ENDP + +ALIGN 64 +cnv2_double_main_loop_rwz_all_asm PROC + INCLUDE cnv2_double_main_loop_rwz_all.inc + ret 0 +cnv2_double_main_loop_rwz_all_asm ENDP + ALIGN 64 cnv1_main_loop_soft_aes_sandybridge_asm PROC INCLUDE cnv1_main_loop_soft_aes_sandybridge.inc @@ -181,5 +256,17 @@ cnv2_main_loop_ultralite_soft_aes_sandybridge_asm PROC ret 0 
cnv2_main_loop_ultralite_soft_aes_sandybridge_asm ENDP +ALIGN 64 +cnv2_main_loop_xcash_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_main_loop_xcash_soft_aes_sandybridge.inc + ret 0 +cnv2_main_loop_xcash_soft_aes_sandybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_zelerius_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_main_loop_zelerius_soft_aes_sandybridge.inc + ret 0 +cnv2_main_loop_zelerius_soft_aes_sandybridge_asm ENDP + _TEXT_CN_MAINLOOP ENDS END \ No newline at end of file diff --git a/src/crypto/asm/win/cn_main_loop_win_gcc.S b/src/crypto/asm/win/cn_main_loop_win_gcc.S index 7bf3c668..ace49b54 100644 --- a/src/crypto/asm/win/cn_main_loop_win_gcc.S +++ b/src/crypto/asm/win/cn_main_loop_win_gcc.S @@ -24,6 +24,19 @@ .global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm) .global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_xcash_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_zelerius_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_rwz_all_asm) +.global FN_PREFIX(cnv2_double_main_loop_rwz_all_asm) + .global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm) @@ -33,6 +46,8 @@ .global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_xcash_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_zelerius_soft_aes_sandybridge_asm) ALIGN 64 FN_PREFIX(cnv1_main_loop_sandybridge_asm): @@ -119,6 +134,56 @@ FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm): #include "../cnv2_double_main_loop_ultralite_sandybridge.inc" ret 0 +ALIGN 64 +FN_PREFIX(cnv2_main_loop_xcash_ivybridge_asm): + #include "../cnv2_main_loop_xcash_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_xcash_ryzen_asm): + #include "../cnv2_main_loop_xcash_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_xcash_bulldozer_asm): + #include "../cnv2_main_loop_xcash_bulldozer.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_double_main_loop_xcash_sandybridge_asm): + #include "../cnv2_double_main_loop_xcash_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_zelerius_ivybridge_asm): + #include "../cnv2_main_loop_zelerius_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_zelerius_ryzen_asm): + #include "../cnv2_main_loop_zelerius_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm): + #include "../cnv2_main_loop_zelerius_bulldozer.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm): + #include "../cnv2_double_main_loop_zelerius_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_rwz_all_asm): + #include "../cnv2_main_loop_rwz_all.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_double_main_loop_rwz_all_asm): + #include "../cnv2_double_main_loop_rwz_all.inc" + ret 0 + ALIGN 64 FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm): #include "../cnv1_main_loop_soft_aes_sandybridge.inc" @@ -157,4 +222,14 @@ 
FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm): ALIGN 64 FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm): #include "../cnv2_main_loop_ultralite_soft_aes_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_xcash_soft_aes_sandybridge_asm): + #include "../cnv2_main_loop_xcash_soft_aes_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_zelerius_soft_aes_sandybridge_asm): + #include "../cnv2_main_loop_zelerius_soft_aes_sandybridge.inc" ret 0 \ No newline at end of file diff --git a/src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc b/src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc new file mode 100644 index 00000000..69ca8793 --- /dev/null +++ b/src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 393216 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movd xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movd xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movd xmm5, QWORD PTR [r8+104] + movd xmm7, rax + + mov eax, 1 + shl rax, 52 + movd xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movd xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movd xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movd xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movd xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movd xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movd xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movd xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN(64) +rwz_main_loop_double: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movd xmm0, r11 + movd xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movd r11, xmm9 + mov edx, r11d + 
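+ ; 2097136 = 0x1FFFF0: masks the value to a 16-byte aligned offset inside the 2 MiB scratchpad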
and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movd xmm0, rbp + movd xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rax+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movd rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movd rdx, xmm5 + shl rdx, 32 + movd rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movd xmm0, rdx + xor rdx, [r11+r13] + movd xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm3 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r8+r13], xmm0 + xor r8d, 32 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm7 + movdqu XMMWORD PTR [r11+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movd r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movd r11, xmm0 + psrldq xmm1, 8 + movd r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movd rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movd rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movd r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js rwz_div_fix_1 +rwz_div_fix_1_ret: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js rwz_div_fix_2 +rwz_div_fix_2_ret: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movd r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je rwz_sqrt_fix_1 +rwz_sqrt_fix_1_ret: + + movd r9, xmm10 + psrldq xmm1, 8 + movd r8, xmm1 + test r8, 524287 + je rwz_sqrt_fix_2 +rwz_sqrt_fix_2_ret: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movd xmm0, rax + movd xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm3, xmm6 + paddq xmm1, 
xmm11 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm3 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm0 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne rwz_main_loop_double + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp rwz_cnv2_double_mainloop_asm_endp + +rwz_div_fix_1: + dec rbx + add r11, rdx + jmp rwz_div_fix_1_ret + +rwz_div_fix_2: + dec rdx + add r8, r9 + jmp rwz_div_fix_2_ret + +rwz_sqrt_fix_1: + movd r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movd xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_1_ret + +rwz_sqrt_fix_2: + psrldq xmm3, 8 + movd r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movd xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp rwz_sqrt_fix_2_ret + +rwz_cnv2_double_mainloop_asm_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_rwz_all.inc b/src/crypto/asm/win/cnv2_main_loop_rwz_all.inc new file mode 100644 index 00000000..99317730 --- /dev/null +++ b/src/crypto/asm/win/cnv2_main_loop_rwz_all.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 393216 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movd xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movd xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movd xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movd xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN(64) +rwz_main_loop: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movd xmm0, r11 + movd xmm7, r8 + punpcklqdq xmm7, xmm0 
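+ ; xmm7 = (a0, a1) round key; the aesenc below runs one AES round on the scratchpad block held in xmm6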
+ aesenc xmm6, xmm7 + movd rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm0, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm2, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + paddq xmm1, xmm7 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movd rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movd rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movd xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movd rdx, xmm3 + test edx, 524287 + je rwz_sqrt_fixup + psrlq xmm3, 19 +rwz_sqrt_fixup_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movd xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movd xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm4 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm5 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm2 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne rwz_main_loop + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_rwz_main_loop_endp + +rwz_sqrt_fixup: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movd xmm3, rdx + jmp rwz_sqrt_fixup_ret + +cnv2_rwz_main_loop_endp: diff --git a/src/crypto/variant4_random_math.h b/src/crypto/variant4_random_math.h new file mode 100644 index 00000000..3bbcdc5b --- /dev/null +++ b/src/crypto/variant4_random_math.h @@ -0,0 +1,447 @@ +#ifndef VARIANT4_RANDOM_MATH_H +#define VARIANT4_RANDOM_MATH_H + +extern "C" +{ +#include "c_blake256.h" +} + +enum V4_Settings +{ + // Generate code with minimal theoretical latency = 45 cycles, which is equivalent to 15 multiplications + TOTAL_LATENCY = 15 * 3, + + // Always generate at least 60 instructions + NUM_INSTRUCTIONS_MIN = 60, + + // Never generate more than 70 instructions (final RET instruction doesn't count here) + NUM_INSTRUCTIONS_MAX = 70, + + // Available ALUs for MUL + // Modern CPUs typically have only 1 ALU which can do multiplications + ALU_COUNT_MUL = 1, + + // Total available ALUs + // Modern CPUs have 4 ALUs, but we use only 3 because random math executes together with other main loop code + ALU_COUNT = 3, +}; + +enum V4_InstructionList +{ + MUL, // a*b + ADD, // a+b + C, C is an unsigned 32-bit constant + SUB, // a-b + ROR, // rotate right "a" by "b & 31" bits + ROL, // rotate left "a" by "b & 31" bits + XOR, // a^b + RET, // finish 
execution + V4_INSTRUCTION_COUNT = RET, +}; + +// V4_InstructionDefinition is used to generate code from random data +// Every random sequence of bytes is valid code +// +// There are 9 registers in total: +// - 4 variable registers +// - 5 constant registers initialized from loop variables +// This is why dst_index is 2 bits +enum V4_InstructionDefinition +{ + V4_OPCODE_BITS = 3, + V4_DST_INDEX_BITS = 2, + V4_SRC_INDEX_BITS = 3, +}; + +struct V4_Instruction +{ + uint8_t opcode; + uint8_t dst_index; + uint8_t src_index; + uint32_t C; +}; + +#ifndef FORCEINLINE +#ifdef __GNUC__ +#define FORCEINLINE __attribute__((always_inline)) inline +#elif _MSC_VER +#define FORCEINLINE __forceinline +#else +#define FORCEINLINE inline +#endif +#endif + +#ifndef UNREACHABLE_CODE +#ifdef __GNUC__ +#define UNREACHABLE_CODE __builtin_unreachable() +#elif _MSC_VER +#define UNREACHABLE_CODE __assume(false) +#else +#define UNREACHABLE_CODE +#endif +#endif + +// Random math interpreter's loop is fully unrolled and inlined to achieve 100% branch prediction on CPU: +// every switch-case will point to the same destination on every iteration of Cryptonight main loop +// +// This is about as fast as it can get without using low-level machine code generation +template<typename v4_reg> +static void v4_random_math(const struct V4_Instruction* code, v4_reg* r) +{ + enum + { + REG_BITS = sizeof(v4_reg) * 8, + }; + +#define V4_EXEC(i) \ + { \ + const struct V4_Instruction* op = code + i; \ + const v4_reg src = r[op->src_index]; \ + v4_reg* dst = r + op->dst_index; \ + switch (op->opcode) \ + { \ + case MUL: \ + *dst *= src; \ + break; \ + case ADD: \ + *dst += src + op->C; \ + break; \ + case SUB: \ + *dst -= src; \ + break; \ + case ROR: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst >> shift) | (*dst << ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case ROL: \ + { \ + const uint32_t shift = src % REG_BITS; \ + *dst = (*dst << shift) | (*dst >> ((REG_BITS - shift) % REG_BITS)); \ + } \ + break; \ + case XOR: \ + *dst ^= src; \ + break; \ + case RET: \ + return; \ + default: \ + UNREACHABLE_CODE; \ + break; \ + } \ + } + +#define V4_EXEC_10(j) \ + V4_EXEC(j + 0) \ + V4_EXEC(j + 1) \ + V4_EXEC(j + 2) \ + V4_EXEC(j + 3) \ + V4_EXEC(j + 4) \ + V4_EXEC(j + 5) \ + V4_EXEC(j + 6) \ + V4_EXEC(j + 7) \ + V4_EXEC(j + 8) \ + V4_EXEC(j + 9) + + // Generated program can have 60 + a few more (usually 2-3) instructions to achieve required latency + // I've checked all block heights < 10,000,000 and here is the distribution of program sizes: + // + // 60 27960 + // 61 105054 + // 62 2452759 + // 63 5115997 + // 64 1022269 + // 65 1109635 + // 66 153145 + // 67 8550 + // 68 4529 + // 69 102 + + // Unroll 70 instructions here + V4_EXEC_10(0); // instructions 0-9 + V4_EXEC_10(10); // instructions 10-19 + V4_EXEC_10(20); // instructions 20-29 + V4_EXEC_10(30); // instructions 30-39 + V4_EXEC_10(40); // instructions 40-49 + V4_EXEC_10(50); // instructions 50-59 + V4_EXEC_10(60); // instructions 60-69 + +#undef V4_EXEC_10 +#undef V4_EXEC +} + +// If we don't have enough data available, generate more +static FORCEINLINE void check_data(size_t* data_index, const size_t bytes_needed, int8_t* data, const size_t data_size) +{ + if (*data_index + bytes_needed > data_size) + { + hash_extra_blake(data, data_size, (char*) data); + *data_index = 0; + } +} +
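+// Illustration only (editorial sketch, not used by the patch): the interpreter
+// above can be driven by hand. Assuming 32-bit registers, as the CN-R variants use:
+//
+//     V4_Instruction prog[2];
+//     prog[0].opcode = ADD; prog[0].dst_index = 0; prog[0].src_index = 1; prog[0].C = 7;
+//     prog[1].opcode = RET; prog[1].dst_index = 0; prog[1].src_index = 0; prog[1].C = 0;
+//
+//     uint32_t r[9] = { 1, 2, 0, 0, 0, 0, 0, 0, 0 }; // r[0..3] variable, r[4..8] constant
+//     v4_random_math<uint32_t>(prog, r);
+//     // ADD computes *dst += src + op->C, so r[0] == 1 + 2 + 7 == 10 afterwards
+
+// Generates as many random math operations as possible with given latency and ALU restrictions
+// "code" array must have space for NUM_INSTRUCTIONS_MAX+1 instructions
+static int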
v4_random_math_init(struct V4_Instruction* code, PowVariant variant, const uint64_t height) +{ + // MUL is 3 cycles, 3-way addition and rotations are 2 cycles, SUB/XOR are 1 cycle + // These latencies match real-life instruction latencies for Intel CPUs starting from Sandy Bridge and up to Skylake/Coffee lake + // + // AMD Ryzen has the same latencies except 1-cycle ROR/ROL, so it'll be a bit faster than Intel Sandy Bridge and newer processors + // Surprisingly, Intel Nehalem also has 1-cycle ROR/ROL, so it'll also be faster than Intel Sandy Bridge and newer processors + // AMD Bulldozer has 4 cycles latency for MUL (slower than Intel) and 1 cycle for ROR/ROL (faster than Intel), so average performance will be the same + // Source: https://www.agner.org/optimize/instruction_tables.pdf + const int op_latency[V4_INSTRUCTION_COUNT] = { 3, 2, 1, 2, 2, 1 }; + + // Instruction latencies for theoretical ASIC implementation + const int asic_op_latency[V4_INSTRUCTION_COUNT] = { 3, 1, 1, 1, 1, 1 }; + + // Available ALUs for each instruction + const int op_ALUs[V4_INSTRUCTION_COUNT] = { ALU_COUNT_MUL, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT, ALU_COUNT }; + + int8_t data[32]; + memset(data, 0, sizeof(data)); + uint64_t tmp = SWAP64LE(height); + memcpy(data, &tmp, sizeof(uint64_t)); + if (variant == POW_V4) + { + data[20] = -38; + } + + // Set data_index past the last byte in data + // to trigger full data update with blake hash + // before we start using it + size_t data_index = sizeof(data); + + int code_size; + + // There is a small chance (1.8%) that register R8 won't be used in the generated program + // So we keep track of it and try again if it's not used + bool r8_used; + do { + int latency[9]; + int asic_latency[9]; + + // Tracks previous instruction and value of the source operand for registers R0-R3 throughout code execution + // byte 0: current value of the destination register + // byte 1: instruction opcode + // byte 2: current value of the source register + // + // Registers R4-R8 are constant and are treated as having the same value because when we do + // the same operation twice with two constant source registers, it can be optimized into a single operation + uint32_t inst_data[9] = { 0, 1, 2, 3, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF, 0xFFFFFF }; + + bool alu_busy[TOTAL_LATENCY + 1][ALU_COUNT]; + bool is_rotation[V4_INSTRUCTION_COUNT]; + bool rotated[4]; + int rotate_count = 0; + + memset(latency, 0, sizeof(latency)); + memset(asic_latency, 0, sizeof(asic_latency)); + memset(alu_busy, 0, sizeof(alu_busy)); + memset(is_rotation, 0, sizeof(is_rotation)); + memset(rotated, 0, sizeof(rotated)); + is_rotation[ROR] = true; + is_rotation[ROL] = true; + + int num_retries = 0; + code_size = 0; + + int total_iterations = 0; + r8_used = (variant == POW_WOW); + + // Generate random code to achieve minimal required latency for our abstract CPU + // Try to get this latency for all 4 registers + while (((latency[0] < TOTAL_LATENCY) || (latency[1] < TOTAL_LATENCY) || (latency[2] < TOTAL_LATENCY) || (latency[3] < TOTAL_LATENCY)) && (num_retries < 64)) + { + // Fail-safe to guarantee loop termination + ++total_iterations; + if (total_iterations > 256) + break; + + check_data(&data_index, 1, data, sizeof(data)); + + const uint8_t c = ((uint8_t*)data)[data_index++]; + + // MUL = opcodes 0-2 + // ADD = opcode 3 + // SUB = opcode 4 + // ROR/ROL = opcode 5, shift direction is selected randomly + // XOR = opcodes 6-7 + uint8_t opcode = c & ((1 << V4_OPCODE_BITS) - 1); + if (opcode == 5) + { + 
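+ // One more random byte picks the rotation direction: a non-negative
+ // (signed) byte selects ROR, a negative one ROL.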
check_data(&data_index, 1, data, sizeof(data)); + opcode = (data[data_index++] >= 0) ? ROR : ROL; + } + else if (opcode >= 6) + { + opcode = XOR; + } + else + { + opcode = (opcode <= 2) ? MUL : (opcode - 2); + } + + uint8_t dst_index = (c >> V4_OPCODE_BITS) & ((1 << V4_DST_INDEX_BITS) - 1); + uint8_t src_index = (c >> (V4_OPCODE_BITS + V4_DST_INDEX_BITS)) & ((1 << V4_SRC_INDEX_BITS) - 1); + + const int a = dst_index; + int b = src_index; + + // Don't do ADD/SUB/XOR with the same register + if (((opcode == ADD) || (opcode == SUB) || (opcode == XOR)) && (a == b)) + { + // a is always < 4, so we don't need to check bounds here + b = (variant == POW_WOW) ? (a + 4) : 8; + src_index = b; + } + + // Don't do rotation with the same destination twice because it's equal to a single rotation + if (is_rotation[opcode] && rotated[a]) + { + continue; + } + + // Don't do the same instruction (except MUL) with the same source value twice because all other cases can be optimized: + // 2xADD(a, b, C) = ADD(a, b*2, C1+C2), same for SUB and rotations + // 2xXOR(a, b) = NOP + if ((opcode != MUL) && ((inst_data[a] & 0xFFFF00) == (opcode << 8) + ((inst_data[b] & 255) << 16))) + { + continue; + } + + // Find which ALU is available (and when) for this instruction + int next_latency = (latency[a] > latency[b]) ? latency[a] : latency[b]; + int alu_index = -1; + while (next_latency < TOTAL_LATENCY) + { + for (int i = op_ALUs[opcode] - 1; i >= 0; --i) + { + if (!alu_busy[next_latency][i]) + { + // ADD is implemented as two 1-cycle instructions on a real CPU, so do an additional availability check + if ((opcode == ADD) && alu_busy[next_latency + 1][i]) + { + continue; + } + + // Rotation can only start when previous rotation is finished, so do an additional availability check + if (is_rotation[opcode] && (next_latency < rotate_count * op_latency[opcode])) + { + continue; + } + + alu_index = i; + break; + } + } + if (alu_index >= 0) + { + break; + } + ++next_latency; + } + + // Don't generate instructions that leave some register unchanged for more than 7 cycles + if (next_latency > latency[a] + 7) + { + continue; + } + + next_latency += op_latency[opcode]; + + if (next_latency <= TOTAL_LATENCY) + { + if (is_rotation[opcode]) + { + ++rotate_count; + } + + // Mark ALU as busy only for the first cycle when it starts executing the instruction because ALUs are fully pipelined + alu_busy[next_latency - op_latency[opcode]][alu_index] = true; + latency[a] = next_latency; + + // ASIC is supposed to have enough ALUs to run as many independent instructions per cycle as possible, so latency calculation for ASIC is simple + asic_latency[a] = ((asic_latency[a] > asic_latency[b]) ? 
asic_latency[a] : asic_latency[b]) + asic_op_latency[opcode]; + + rotated[a] = is_rotation[opcode]; + + inst_data[a] = code_size + (opcode << 8) + ((inst_data[b] & 255) << 16); + + code[code_size].opcode = opcode; + code[code_size].dst_index = dst_index; + code[code_size].src_index = src_index; + code[code_size].C = 0; + + if (src_index == 8) + { + r8_used = true; + } + + if (opcode == ADD) + { + // ADD instruction is implemented as two 1-cycle instructions on a real CPU, so mark ALU as busy for the next cycle too + alu_busy[next_latency - op_latency[opcode] + 1][alu_index] = true; + + // ADD instruction requires 4 more random bytes for 32-bit constant "C" in "a = a + b + C" + check_data(&data_index, sizeof(uint32_t), data, sizeof(data)); + uint32_t t; + memcpy(&t, data + data_index, sizeof(uint32_t)); + code[code_size].C = SWAP32LE(t); + data_index += sizeof(uint32_t); + } + + ++code_size; + if (code_size >= NUM_INSTRUCTIONS_MIN) + { + break; + } + } + else + { + ++num_retries; + } + } + + // ASIC has more execution resources and can extract as much parallelism from the code as possible + // We need to add a few more MUL and ROR instructions to achieve minimal required latency for ASIC + // Get this latency for at least 1 of the 4 registers + const int prev_code_size = code_size; + while ((code_size < NUM_INSTRUCTIONS_MAX) && (asic_latency[0] < TOTAL_LATENCY) && (asic_latency[1] < TOTAL_LATENCY) && (asic_latency[2] < TOTAL_LATENCY) && (asic_latency[3] < TOTAL_LATENCY)) + { + int min_idx = 0; + int max_idx = 0; + for (int i = 1; i < 4; ++i) + { + if (asic_latency[i] < asic_latency[min_idx]) min_idx = i; + if (asic_latency[i] > asic_latency[max_idx]) max_idx = i; + } + + const uint8_t pattern[3] = { ROR, MUL, MUL }; + const uint8_t opcode = pattern[(code_size - prev_code_size) % 3]; + latency[min_idx] = latency[max_idx] + op_latency[opcode]; + asic_latency[min_idx] = asic_latency[max_idx] + asic_op_latency[opcode]; + + code[code_size].opcode = opcode; + code[code_size].dst_index = min_idx; + code[code_size].src_index = max_idx; + code[code_size].C = 0; + ++code_size; + } + + // There is ~98.15% chance that loop condition is false, so this loop will execute only 1 iteration most of the time + // It never does more than 4 iterations for all block heights < 10,000,000 + } while (!r8_used || (code_size < NUM_INSTRUCTIONS_MIN) || (code_size > NUM_INSTRUCTIONS_MAX)); + + // It's guaranteed that NUM_INSTRUCTIONS_MIN <= code_size <= NUM_INSTRUCTIONS_MAX here + // Add final instruction to stop the interpreter + code[code_size].opcode = RET; + code[code_size].dst_index = 0; + code[code_size].src_index = 0; + code[code_size].C = 0; + + return code_size; +} + +#endif \ No newline at end of file diff --git a/src/net/Client.cpp b/src/net/Client.cpp index 21e1c076..4a03780b 100644 --- a/src/net/Client.cpp +++ b/src/net/Client.cpp @@ -260,6 +260,14 @@ bool Client::parseJob(const rapidjson::Value &params, int *code) job.setPowVariant(powVariant); + if (params.HasMember("height")) { + const rapidjson::Value &variant = params["height"]; + + if (variant.IsUint64()) { + job.setHeight(variant.GetUint64()); + } + } + if (m_job != job) { m_jobs++; m_job = std::move(job); diff --git a/src/net/Job.cpp b/src/net/Job.cpp index 9835974c..4f162d00 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -62,6 +62,7 @@ Job::Job(int poolId, bool nicehash) : m_size(0), m_diff(0), m_target(0), + m_height(0), m_powVariant(PowVariant::POW_AUTODETECT) { } diff --git a/src/net/Job.h b/src/net/Job.h index 74fc5dc8..62ac26d3 100644 --- 
a/src/net/Job.h +++ b/src/net/Job.h @@ -54,9 +54,11 @@ public: inline uint32_t *nonce() { return reinterpret_cast<uint32_t*>(m_blob + 39); } inline uint32_t diff() const { return (uint32_t) m_diff; } inline uint64_t target() const { return m_target; } + inline uint64_t height() const { return m_height; } inline void setNicehash(bool nicehash) { m_nicehash = nicehash; } inline void setThreadId(int threadId) { m_threadId = threadId; } inline void setPowVariant(PowVariant powVariant) { m_powVariant = powVariant; } + inline void setHeight(uint64_t height) { m_height = height; } static bool fromHex(const char* in, unsigned int len, unsigned char* out); static inline uint32_t *nonce(uint8_t *blob) { return reinterpret_cast<uint32_t*>(blob + 39); } @@ -76,6 +78,7 @@ private: size_t m_size; uint64_t m_diff; uint64_t m_target; + uint64_t m_height; PowVariant m_powVariant; }; diff --git a/src/net/Network.cpp b/src/net/Network.cpp index f186b61a..78faa7e3 100644 --- a/src/net/Network.cpp +++ b/src/net/Network.cpp @@ -170,11 +170,11 @@ void Network::onResultAccepted(Client *client, const SubmitResult &result, const void Network::setJob(Client *client, const Job &job) { if (m_options->colors()) { - LOG_INFO("\x1B[01;35mnew job\x1B[0m from \x1B[01;37m%s:%d\x1B[0m with diff \x1B[01;37m%d\x1B[0m and PoW \x1B[01;37m%s", + LOG_INFO("\x1B[01;35mnew job\x1B[0m from \x1B[01;37m%s:%d\x1B[0m with diff \x1B[01;37m%d\x1B[0m variant \x1B[01;37m%s", client->host(), client->port(), job.diff(), getPowVariantName(job.powVariant()).c_str()); } else { - LOG_INFO("new job from %s:%d with diff %d and PoW %s", client->host(), client->port(), job.diff(), getPowVariantName(job.powVariant()).c_str()); + LOG_INFO("new job from %s:%d with diff %d variant %s", client->host(), client->port(), job.diff(), getPowVariantName(job.powVariant()).c_str()); } m_state.powVariant = job.powVariant(); diff --git a/src/version.h b/src/version.h index 4be54f8f..60265e7d 100644 --- a/src/version.h +++ b/src/version.h @@ -36,14 +36,14 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.8.13 (based on XMRig)" +#define APP_VERSION "1.9.0 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 -#define APP_VER_MINOR 8 -#define APP_VER_BUILD 13 +#define APP_VER_MINOR 9 +#define APP_VER_BUILD 0 #define APP_VER_REV 0 #ifndef NDEBUG diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index a0dd6ff8..3fccc058 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -141,7 +141,7 @@ void MultiWorker::start() *Job::nonce(m_state->blob + i * m_state->job.size()) = ++m_state->nonces[i]; } - CryptoNight::hash(m_hashFactor, Options::i()->asmOptimization(), m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); + CryptoNight::hash(m_hashFactor, Options::i()->asmOptimization(), m_state->job.height(), m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); for (size_t i=0; i < m_hashFactor; ++i) { if (*reinterpret_cast<uint64_t*>(m_hash + 24 + i * 32) < m_state->job.target()) {
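The height parameter threaded through MultiWorker::start() above exists because CN-R derives a fresh random-math program from the PoW variant and the block height. A minimal standalone sketch of that dependency, using only names this patch introduces (V4_Instruction, v4_random_math_init, v4_random_math, POW_V4, NUM_INSTRUCTIONS_MAX); it assumes the xmrigCC include paths resolve, that the header's c_blake256/PowVariant/SWAP32LE/SWAP64LE dependencies are available, and the height and register seeds below are made-up placeholder values:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    #include "PowVariant.h"
    #include "crypto/variant4_random_math.h"

    int main()
    {
        // One spare slot for the terminating RET that v4_random_math_init appends.
        V4_Instruction code[NUM_INSTRUCTIONS_MAX + 1];

        // The generated program depends only on the variant and the block height,
        // so every nonce of a given block reuses the same instruction sequence.
        const int size = v4_random_math_init(code, POW_V4, 1900000);
        std::printf("generated %d instructions\n", size);

        // r[0..3] come from the hash state, r[4..8] from main-loop variables;
        // these are placeholders.
        uint32_t r[9] = { 0x12345678, 0x9abcdef0, 0xdeadbeef, 0xcafebabe, 1, 2, 3, 4, 5 };
        v4_random_math<uint32_t>(code, r);
        return 0;
    }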