From 1273e45e4685888593349741c3b405260a6c2049 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ben=20Gr=C3=A4f?= Date: Fri, 4 Jan 2019 18:34:48 +0000 Subject: [PATCH] Integrated new Algos (#224) - Added XTL v5/9 with autodetect (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9)) - Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant: "upx") - Added force-pow-variant parameter to force usage of the variant from the config and skip parsing of pow/variant from job/pool --- CHANGELOG.md | 4 + src/Options.cpp | 56 ++- src/Options.h | 2 + src/PowVariant.h | 10 + src/Summary.cpp | 8 +- src/config.json | 3 +- src/crypto/CryptoNight.cpp | 51 +++ src/crypto/CryptoNight_test.h | 13 +- src/crypto/CryptoNight_x86.h | 116 ++++- ...cn_fastv2_double_main_loop_sandybridge.inc | 414 ++++++++++++++++++ .../asm/cn_fastv2_main_loop_bulldozer.inc | 180 ++++++++ .../asm/cn_fastv2_main_loop_ivybridge.inc | 186 ++++++++ src/crypto/asm/cn_fastv2_main_loop_ryzen.inc | 183 ++++++++ ...n_fastv2_mainloop_soft_aes_sandybridge.inc | 271 ++++++++++++ .../asm/cn_liteupx_mainloop_sandybridge.inc | 74 ++++ ..._liteupx_mainloop_soft_aes_sandybridge.inc | 166 +++++++ src/crypto/asm/cn_main_loop.S | 94 +++- ...cn_fastv2_double_main_loop_sandybridge.inc | 410 +++++++++++++++++ .../asm/win/cn_fastv2_main_loop_bulldozer.inc | 180 ++++++++ .../asm/win/cn_fastv2_main_loop_ivybridge.inc | 182 ++++++++ .../asm/win/cn_fastv2_main_loop_ryzen.inc | 179 ++++++++ ...n_fastv2_mainloop_soft_aes_sandybridge.inc | 267 +++++++++++ .../win/cn_liteupx_mainloop_sandybridge.inc | 70 +++ ..._liteupx_mainloop_soft_aes_sandybridge.inc | 162 +++++++ src/crypto/asm/win/cn_main_loop.asm | 51 ++- src/crypto/asm/win/cn_main_loop_win_gcc.S | 43 ++ src/default_miner_config.json | 3 +- src/net/Client.cpp | 38 +- src/net/Job.cpp | 4 + src/version.h | 4 +- 30 files changed, 3372 insertions(+), 52 deletions(-) create mode 100644 src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc create mode 100644 src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc create mode 100644 src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc create mode 100644 src/crypto/asm/cn_fastv2_main_loop_ryzen.inc create mode 100644 src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc create mode 100644 src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc create mode 100644 src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc create mode 100644 src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc create mode 100644 src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc create mode 100644 src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc create mode 100644 src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc create mode 100644 src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc create mode 100644 src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc create mode 100644 src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc diff --git a/CHANGELOG.md b/CHANGELOG.md index ee86a451..7e46109d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +# 1.8.8 +- Added XTL v5/9 with autodetect (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9)) +- Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant: "upx") +- Added force-pow-variant parameter to force usage of the variant from the config and skip parsing of pow/variant from job/pool # 1.8.7 - Implemented Template based mass config editor to simple swap configs on your rigs # 1.8.6 diff --git a/src/Options.cpp b/src/Options.cpp index f38caaea..7efae639
100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -73,7 +73,7 @@ Options:\n" -k, --keepalive send keepalived for prevent timeout (need pool support)\n\ -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\ - --pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for v5), 'rto', 'xfh'\n\ + --pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx'\n\ for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\ --asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\ --multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\ @@ -90,7 +90,9 @@ Options:\n" --print-time=N print hashrate report every N seconds\n\ --api-port=N port for the miner API\n\ --api-access-token=T access token for API\n\ - --api-worker-id=ID custom worker-id for API\n" + --api-worker-id=ID custom worker-id for API\n\ + --reboot-cmd command/bat to execute to Reboot miner\n\ + --force-pow-variant disable pow/variant parsing from pool\n" # ifndef XMRIG_NO_CC "\ --cc-url=URL url of the CC Server\n\ @@ -99,8 +101,7 @@ Options:\n" --cc-worker-id=ID custom worker-id for CC Server\n\ --cc-update-interval-s=N status update interval in seconds (default: 10 min: 1)\n\ --cc-use-remote-logging enable remote logging on CC Server\n\ - --cc-upload-config-on-startup upload current miner config to CC Server on startup\n\ - --cc-reboot-cmd command/bat to execute to Reboot miner\n" + --cc-upload-config-on-startup upload current miner config to CC Server on startup\n" # endif # endif @@ -175,8 +176,9 @@ static struct option const options[] = { { "userpass", 1, nullptr, 'O' }, { "version", 0, nullptr, 'V' }, { "use-tls", 0, nullptr, 1015 }, - { "force-pow-version", 1, nullptr, 1016 }, - { "pow-variant" ,1, nullptr, 1017 }, + { "force-pow-variant", 0, nullptr, 1016 }, + { "pow-variant", 1, nullptr, 1017 }, + { "variant", 1, nullptr, 1017 }, { "api-port", 1, nullptr, 4000 }, { "api-access-token", 1, nullptr, 4001 }, { "api-worker-id", 1, nullptr, 4002 }, @@ -232,8 +234,9 @@ static struct option const config_options[] = { { "syslog", 0, nullptr, 'S' }, { "threads", 1, nullptr, 't' }, { "user-agent", 1, nullptr, 1008 }, - { "force-pow-version", 1, nullptr, 1016 }, + { "force-pow-variant", 0, nullptr, 1016 }, { "pow-variant", 1, nullptr, 1017 }, + { "variant", 1, nullptr, 1017 }, { "doublehash-thread-mask", 1, nullptr, 4013 }, { "multihash-thread-mask", 1, nullptr, 4013 }, { "asm-optimization", 1, nullptr, 4020 }, @@ -250,6 +253,8 @@ static struct option const pool_options[] = { { "keepalive", 0, nullptr ,'k' }, { "nicehash", 0, nullptr, 1006 }, { "use-tls", 0, nullptr, 1015 }, + { "pow-variant", 1, nullptr, 1017 }, + { "variant", 1, nullptr, 1017 }, { nullptr, 0, nullptr, 0 } }; @@ -318,7 +323,9 @@ constexpr static const char *pow_variant_names[] = { "msr", "xhv", "rto", - "xfh" + "xfh", + "xtlv9", + "upx" }; constexpr static const char *asm_optimization_names[] = { @@ -366,6 +373,7 @@ Options::Options(int argc, char **argv) : m_ccPushOfflineMiners(false), m_ccPushPeriodicStatus(false), 
m_ccPushZeroHashrateMiners(false), + m_forcePowVariant(false), m_fileName(Platform::defaultConfigName()), m_apiToken(nullptr), m_apiWorkerId(nullptr), @@ -606,7 +614,6 @@ bool Options::parseArg(int key, const char *arg) case 1003: /* --donate-level */ case 1004: /* --max-cpu-usage */ case 1007: /* --print-time */ - case 1016: /* --force-pow-version */ case 1021: /* --cpu-priority */ case 4000: /* --api-port */ case 4006: /* --cc-port */ @@ -628,7 +635,10 @@ bool Options::parseArg(int key, const char *arg) case 1015: /* --use-tls */ return parseBoolean(key, true); - case 1017: /* --pow-variant */ + case 1016: /* --force-pow-variant */ + return parseBoolean(key, false); + + case 1017: /* --pow-variant/--variant */ return parsePowVariant(arg); case 4016: /* --cc-use-tls */ @@ -803,16 +813,6 @@ bool Options::parseArg(int key, uint64_t arg) m_printTime = (int) arg; break; - case 1016: /* --force-pow-version */ - showDeprecateWarning("force-pow-version", "pow-variant"); - if (arg != POW_AUTODETECT && arg != POW_V0 && arg != POW_V1) { - showUsage(1); - return false; - } - - m_powVariant = static_cast(arg); - break; - case 1020: /* --cpu-affinity */ if (arg) { m_affinity = arg; @@ -901,6 +901,10 @@ bool Options::parseBoolean(int key, bool enable) m_pools.back()->setUseTls(enable); break; + case 1016: /* --force-pow-variant */ + m_forcePowVariant = enable; + break; + case 2000: /* --colors */ m_colors = enable; break; @@ -1151,11 +1155,21 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "freehaven") || !strcmp(powVariant, "faven"))) { + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "freehaven") || !strcmp(powVariant, "faven") || !strcmp(powVariant, "swap"))) { m_powVariant = POW_XFH; break; } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellitev9")) { + m_powVariant = POW_XTL_V9; + break; + } + + if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "uplexa")) { + m_powVariant = POW_UPX; + break; + } + if (i == ARRAY_SIZE(pow_variant_names) - 1) { showUsage(1); return false; diff --git a/src/Options.h b/src/Options.h index a91508f1..7d4c1309 100644 --- a/src/Options.h +++ b/src/Options.h @@ -80,6 +80,7 @@ public: inline bool ccPushZeroHashrateMiners() const { return m_ccPushZeroHashrateMiners; } inline bool ccUsePushover() const { return ccPushoverUser() && ccPushoverToken(); } inline bool ccUseTelegram() const { return ccTelegramBotToken() && ccTelegramChatId(); } + inline bool forcePowVariant() const { return m_forcePowVariant; }; inline const char *fileName() const { return m_fileName; } inline const char *apiToken() const { return m_apiToken; } inline const char *apiWorkerId() const { return m_apiWorkerId; } @@ -165,6 +166,7 @@ private: bool m_ccPushOfflineMiners; bool m_ccPushPeriodicStatus; bool m_ccPushZeroHashrateMiners; + bool m_forcePowVariant; const char* m_fileName; char *m_apiToken; char *m_apiWorkerId; diff --git a/src/PowVariant.h b/src/PowVariant.h index b1ca313c..81eb6e2c 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -35,6 +35,8 @@ enum PowVariant POW_XHV, POW_RTO, POW_XFH, + POW_XTL_V9, + POW_UPX, LAST_ITEM }; @@ -62,6 +64,10 @@ inline std::string getPowVariantName(PowVariant powVariant) return "rto"; case POW_XFH: return "xfh"; + case POW_XTL_V9: + return "xtlv9"; + case POW_UPX: + return "upx"; case POW_AUTODETECT: default: return "-1"; @@ -129,6 +135,10 @@ inline PowVariant parseVariant(const std::string variant) powVariant = 
PowVariant::POW_RTO; } else if (variant == "xfh" || variant == "freehaven" || variant == "faven") { powVariant = PowVariant::POW_XFH; + } else if (variant == "xtlv9" || variant == "stellite_v9") { + powVariant = PowVariant::POW_XTL_V9; + } else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") { + powVariant = PowVariant::POW_UPX; } return powVariant; diff --git a/src/Summary.cpp b/src/Summary.cpp index 1c8de9e3..b55be0e8 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -63,8 +63,8 @@ static void print_cpu() Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "\x1B[01;32m" : "\x1B[01;31m-", - Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-", - Options::i()->asmOptimization() != AsmOptimization::ASM_OFF ? "\x1B[01;32m" : "\x1B[01;31m", + Cpu::hasAES() && Options::i()->aesni() ? "\x1B[01;32m" : "\x1B[01;31m-", + Options::i()->asmOptimization() != AsmOptimization::ASM_OFF ? "\x1B[01;32m" : "\x1B[01;31m-", getAsmOptimizationName(Options::i()->asmOptimization()).c_str()); # ifndef XMRIG_NO_LIBCPUID Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0); @@ -121,12 +121,10 @@ static void print_threads() } Log::i()->text(Options::i()->colors() ? - "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, %saes=%d\x1B[01;37m, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" : + "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" : " * THREADS: %d, %s, %saes=%d, hf=%zu, %sdonate=%d%%%s%s", Options::i()->threads(), Options::i()->algoName(), - Options::i()->colors() && Options::i()->aesni() == 0 ? "\x1B[01;31m" : "", - Options::i()->aesni(), Options::i()->hashFactor(), Options::i()->colors() && Options::i()->donateLevel() == 0 ? "\x1B[01;31m" : "", Options::i()->donateLevel(), diff --git a/src/config.json b/src/config.json index e735ae76..5e8e6eaa 100644 --- a/src/config.json +++ b/src/config.json @@ -4,7 +4,7 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto, xfh + "pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) @@ -20,6 +20,7 @@ "safe": false, // true to safe adjust threads and av settings for current CPU "syslog": false, // use system log for output messages "reboot-cmd" : "", // command to execute to reboot the OS + "force-pow-variant" : false, // force the configured pow variant, don't parse pow/variant from the pool job "pools": [ { "url": "donate2.graef.in:80", // URL of mining server diff
--git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index 67e7ac5c..c31f28ba 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -70,6 +70,18 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } #endif +} else if (powVersion == PowVariant::POW_XTL_V9) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || + (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) || + (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif } else if (powVersion == PowVariant::POW_MSR) { #if defined(XMRIG_ARM) CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); @@ -111,6 +123,16 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV } else { CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); } +#endif + } else if (powVersion == PowVariant::POW_XTL_V9) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } #endif } else if (powVersion == PowVariant::POW_ALLOY) { CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); @@ -158,6 +180,16 @@ static void cryptonight_lite_aesni(AsmOptimization asmOptimization, PowVariant p #endif } else if (powVersion == PowVariant::POW_TUBE) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); + } else if (powVersion == PowVariant::POW_UPX) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif } else { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 
0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } @@ -178,6 +210,16 @@ static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant #endif } else if (powVersion == PowVariant::POW_TUBE) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); + } else if (powVersion == PowVariant::POW_UPX) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif } else { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } @@ -430,6 +472,10 @@ bool CryptoNight::selfTest(int algo) resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0; #endif + // cn-lite upx + + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads); + resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0; } else { // cn v0 aka orignal @@ -525,6 +571,11 @@ bool CryptoNight::selfTest(int algo) cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xfh, 32) == 0; + + // cnv8 + xtl aka cn-fast2 + + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL_V9, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_xtl_v9, 32) == 0; } for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 8cf0d637..d2f2634d 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -122,6 +122,12 @@ const static uint8_t test_output_xfh[32] = { 0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF }; +// CN XTL V9 +const static uint8_t test_output_xtl_v9[32] = { + 0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD, + 0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7 +}; + // CN-LITE const static uint8_t test_output_v0_lite[160] = { 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, @@ -151,7 +157,6 @@ const static uint8_t test_output_v1_lite[160] = { 0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6 }; - // CN-Lite IPBC const static uint8_t test_output_ipbc_lite[160] = { 0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5, @@ -167,6 +172,12 @@ const static uint8_t test_output_ipbc_lite[160] = { }; +// CN-Lite v7 +const static uint8_t test_output_upx[32] = { + 0xD1, 0x13, 0xE1, 0x1B, 0xBE, 0xD3, 0x2A, 0xC1, 0x7C, 0x2C, 0xAA, 0x55, 0xCC, 0x84, 0x2F, 0xA4, + 0x88, 0x91, 0xEE, 0x45, 0x63, 0x22, 0xA3, 0x0A, 0xB2, 0x80, 0xDF, 0x35, 0x16, 0x5C, 0xAF, 0x9A +}; + // CN-Heavy const static uint8_t test_output_heavy[160] = { 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 
0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 387794cd..82fe8e40 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -57,10 +57,17 @@ extern "C" void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0); void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0); void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cn_fastv2_mainloop_ivybridge_asm(ScratchPad* ctx0); + void cn_fastv2_mainloop_ryzen_asm(ScratchPad* ctx0); + void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0); + void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); #endif } @@ -768,7 +775,8 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS];CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube( + input, size, output, scratchPad); uint64_t sqrt_result[NUM_HASH_BLOCKS]; __m128i bx0[NUM_HASH_BLOCKS]; __m128i bx1[NUM_HASH_BLOCKS]; @@ -875,6 +883,15 @@ public: // not supported } + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1433,6 +1450,8 @@ public: } else { cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); } + } else { + cn_liteupx_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); } } else { if (ITERATIONS == 0x80000) { @@ -1443,6 +1462,8 @@ public: } else { cn_litev1_mainloop_sandybridge_asm(scratchPad[0]); } + } else { + cn_liteupx_mainloop_sandybridge_asm(scratchPad[0]); } } #endif @@ -1559,6 +1580,40 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + // single asm + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + const uint8_t* l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (asmOptimization == AsmOptimization::ASM_INTEL) { + if (SOFT_AES) { + scratchPad[0]->input = input; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]); + } + } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { + cn_fastv2_mainloop_ryzen_asm(scratchPad[0]); + } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { + cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + 
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2278,6 +2333,38 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + // double asm + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + +#ifndef XMRIG_NO_ASM + cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); +#endif + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -3225,6 +3312,15 @@ public: // not supported } + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4480,6 +4576,15 @@ public: // not supported } + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -5405,6 +5510,15 @@ public: // not supported } + inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..72ab414d --- /dev/null +++ b/src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc @@ -0,0 +1,414 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 262144 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + 
xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +main_loop_double_fast2_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], 
xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_fast2_sandybridge +div_fix_1_ret_fast2_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_fast2_sandybridge +div_fix_2_ret_fast2_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_fast2_sandybridge +sqrt_fix_1_ret_fast2_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_fast2_sandybridge +sqrt_fix_2_ret_fast2_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_fast2_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp 
cnv2_double_mainloop_asm_fast2_sandybridge_endp + +div_fix_1_fast2_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_fast2_sandybridge + +div_fix_2_fast2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_fast2_sandybridge + +sqrt_fix_1_fast2_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_fast2_sandybridge + +sqrt_fix_2_fast2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_fast2_sandybridge + +cnv2_double_mainloop_asm_fast2_sandybridge_endp: diff --git a/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc b/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc new file mode 100644 index 00000000..8d341665 --- /dev/null +++ b/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_fast2_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_fast2_bulldozer + shr rdi, 19 + +sqrt_fixup_fast2_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + 
punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_fast2_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_fast2_bulldozer_endp + +sqrt_fixup_fast2_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_fast2_bulldozer_ret + +cnv2_main_loop_fast2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc b/src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..8dd92f3b --- /dev/null +++ b/src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 262144 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_fast2_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, 
edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_fast2_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_fast2_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_fast2_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp $cnv2_main_loop_fast2_ivybridge_endp + +$sqrt_fixup_fast2_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_fast2_ivybridge_ret + +$cnv2_main_loop_fast2_ivybridge_endp: diff --git a/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc b/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc new file mode 100644 index 00000000..e4012d0c --- /dev/null +++ b/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc @@ -0,0 +1,183 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_fast2_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR 
[r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_fast2_ryzen + shr rdi, 19 + +$sqrt_fixup_fast2_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_fast2_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_fast2_ryzen_endp + +$sqrt_fixup_fast2_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_fast2_ryzen_ret + +$cnv2_main_loop_fast2_ryzen_endp: diff --git a/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..3324137e --- /dev/null +++ b/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,271 @@ + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 
+ and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 262144 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv2_mainloop_soft_aes_fast2_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + je sqrt_fixup_soft_aes_fast2_sandybridge + psrlq xmm1, 19 +sqrt_fixup_soft_aes_fast2_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD 
PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_fast2_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp + +sqrt_fixup_soft_aes_fast2_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret + +cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp: diff --git a/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc b/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc new file mode 100644 index 00000000..4dae0c33 --- /dev/null +++ b/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc @@ -0,0 +1,74 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r14 + push r15 + mov rax, QWORD PTR [rcx+48] + mov ebp, 131072 + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm3, rax + mov rax, QWORD PTR [rcx+256] + mov rdi, QWORD PTR [rcx+40] + movq xmm0, rdx + xor rdi, QWORD PTR [rcx+8] + mov rdx, r8 + mov r15, QWORD PTR [rcx+264] + and edx, 1048560 + mov r14, QWORD PTR [rax+35] + xor r14, QWORD PTR [rcx+192] + mov rsi, QWORD PTR [rcx+224] + punpcklqdq xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cn_liteupx_mainloop_sandybridge: + movq xmm0, rdi + movq xmm1, r8 + punpcklqdq xmm1, xmm0 + aesenc xmm2, xmm1 + movq r10, xmm2 + mov r9d, r10d + and r9d, 1048560 + add r9, rsi + movdqa xmm0, xmm2 + pxor xmm0, xmm3 + movdqa xmm3, xmm2 + movdqu XMMWORD PTR [rdx+rsi], xmm0 + psrldq xmm0, 11 + movq rax, xmm0 + movzx eax, al + movzx eax, BYTE PTR [rax+r15] + mov BYTE PTR [rsi+rdx+11], al + mov rbx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + mul r10 + add r8, rdx + mov QWORD PTR [r9], r8 + add rdi, rax + mov rax, r14 + xor rax, rdi + mov QWORD PTR [r9+8], rax + xor r8, rbx + mov rdx, r8 + and edx, 1048560 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + xor rdi, r11 + dec ebp + jne cn_liteupx_mainloop_sandybridge + + mov rbx, QWORD PTR [rsp+24] + mov rbp, QWORD PTR [rsp+32] + mov rsi, QWORD PTR [rsp+40] + mov rdi, QWORD PTR [rsp+48] + pop r15 + pop r14 diff --git a/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..880f8b09 --- /dev/null +++ b/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,166 @@ + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 72 + + movaps XMMWORD PTR [rsp], xmm6 + movaps XMMWORD PTR [rsp+16], xmm7 + 
movaps XMMWORD PTR [rsp+32], xmm8 + movaps XMMWORD PTR [rsp+48], xmm9 + + mov rax, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm4, rax + mov rax, QWORD PTR [rcx+256] + mov r13, QWORD PTR [rcx+40] + movq xmm0, rdx + xor r13, QWORD PTR [rcx+8] + mov rdx, r8 + mov rdi, QWORD PTR [rcx+224] + and edx, 1048560 + mov rax, QWORD PTR [rax+35] + xor rax, QWORD PTR [rcx+192] + movq xmm5, rax + movq xmm8, rdi + punpcklqdq xmm4, xmm0 + mov QWORD PTR [rsp+64], rdx + + movq xmm6, rcx + mov rax, QWORD PTR [rcx+264] + movq xmm7, rax + + mov eax, 131072 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cn_liteupx_mainloop_soft_aes_sandybridge: + movq xmm9, rax + mov r12, QWORD PTR [rcx+272] + mov esi, DWORD PTR [rdx+rdi] + mov r10d, DWORD PTR [rdx+rdi+4] + mov ebp, DWORD PTR [rdx+rdi+12] + mov r14d, DWORD PTR [rdx+rdi+8] + mov rdx, QWORD PTR [rsp+64] + movzx ecx, sil + shr esi, 8 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + add ebp, 256 + movd xmm1, r11d + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movq rdi, xmm8 + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + punpckldq xmm2, xmm1 + movq xmm1, r8 + xor eax, DWORD PTR [r12+rcx*4] + xor eax, r15d + movd xmm3, eax + movq rax, xmm7 + punpckldq xmm3, xmm0 + movq xmm0, r13 + punpcklqdq xmm1, xmm0 + punpckldq xmm3, xmm2 + pxor xmm3, xmm1 + movq r9, xmm3 + mov r10d, r9d + and r10d, 1048560 + movdqa xmm0, xmm3 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx+rdi], xmm0 + psrldq xmm0, 11 + movq rcx, xmm0 + movzx ecx, cl + mov cl, BYTE PTR [rcx+rax] + mov BYTE PTR [rdi+rdx+11], cl + mov rbx, QWORD PTR [r10+rdi] + mov rcx, r9 + lea r9, QWORD PTR [r10+rdi] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + movdqa xmm4, xmm3 + mul rcx + movq rcx, xmm6 + add r8, rdx + add r13, rax + movq rax, xmm5 + xor rax, r13 + mov QWORD PTR [r9], r8 + xor r8, rbx + mov QWORD PTR [r9+8], rax + movq rax, xmm9 + mov rdx, r8 + xor r13, r11 + and edx, 1048560 + mov QWORD PTR [rsp+64], rdx + sub eax, 1 + jne cn_liteupx_mainloop_soft_aes_sandybridge + + movaps xmm6, XMMWORD PTR [rsp] + movaps xmm7, XMMWORD PTR [rsp+16] + movaps xmm8, XMMWORD PTR [rsp+32] + movaps xmm9, XMMWORD PTR [rsp+48] + + add rsp, 72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index ec139a5f..e423543b 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -14,11 +14,18 @@ .global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global 
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm) +.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm) +.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) #ifdef __APPLE__ ALIGN 16 @@ -105,6 +112,67 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): add rsp, 48 ret 0 +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_fastv2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_fastv2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_fastv2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_fastv2_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cn_fastv2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_liteupx_mainloop_sandybridge.inc" + add rsp, 48 + ret 0 + #ifdef __APPLE__ ALIGN 16 #else @@ -151,4 +219,28 @@ FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): mov rcx, rdi #include "cnv2_mainloop_soft_aes_sandybridge.inc" add rsp, 48 - ret 0 \ No newline at end of file + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_fastv2_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_liteupx_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..d712e10e --- /dev/null +++ b/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 262144 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 
+ movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 64 +main_loop_double_fast2_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, 
r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_fast2_sandybridge +div_fix_1_ret_fast2_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_fast2_sandybridge +div_fix_2_ret_fast2_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_fast2_sandybridge +sqrt_fix_1_ret_fast2_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_fast2_sandybridge +sqrt_fix_2_ret_fast2_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_fast2_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp + +div_fix_1_fast2_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_fast2_sandybridge + +div_fix_2_fast2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_fast2_sandybridge + 
+sqrt_fix_1_fast2_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_fast2_sandybridge + +sqrt_fix_2_fast2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_fast2_sandybridge + +cnv2_double_mainloop_asm_fast2_sandybridge_endp: diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc b/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc new file mode 100644 index 00000000..a73752fc --- /dev/null +++ b/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_fast2_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movd r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movd xmm0, rax + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_fast2_bulldozer + shr rdi, 19 + +sqrt_fixup_fast2_bulldozer_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, 
xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_fast2_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_fast2_bulldozer_endp + +sqrt_fixup_fast2_bulldozer: + movd r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_fast2_bulldozer_ret + +cnv2_main_loop_fast2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc b/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..bc3d592c --- /dev/null +++ b/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc @@ -0,0 +1,182 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 262144 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 64 +$main_loop_fast2_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + 
movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_fast2_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_fast2_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_fast2_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp $cnv2_main_loop_fast2_ivybridge_endp + +$sqrt_fixup_fast2_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_fast2_ivybridge_ret + +$cnv2_main_loop_fast2_ivybridge_endp: diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc b/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc new file mode 100644 index 00000000..2bf76e19 --- /dev/null +++ b/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 262144 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 64 +$main_loop_fast2_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + 
and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_fast2_ryzen + shr rdi, 19 + +$sqrt_fixup_fast2_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_fast2_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_fast2_ryzen_endp + +$sqrt_fixup_fast2_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_fast2_ryzen_ret + +$cnv2_main_loop_fast2_ryzen_endp: diff --git a/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..2e678c04 --- /dev/null +++ b/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,267 @@ + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 262144 + + ALIGN 64 
+cnv2_mainloop_soft_aes_fast2_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + je sqrt_fixup_soft_aes_fast2_sandybridge + psrlq xmm1, 19 +sqrt_fixup_soft_aes_fast2_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + 
mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_fast2_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp + +sqrt_fixup_soft_aes_fast2_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret + +cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp: diff --git a/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc b/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc new file mode 100644 index 00000000..b6bc2e6c --- /dev/null +++ b/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc @@ -0,0 +1,70 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r14 + push r15 + mov rax, QWORD PTR [rcx+48] + mov ebp, 131072 + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm3, rax + mov rax, QWORD PTR [rcx+256] + mov rdi, QWORD PTR [rcx+40] + movq xmm0, rdx + xor rdi, QWORD PTR [rcx+8] + mov rdx, r8 + mov r15, QWORD PTR [rcx+264] + and edx, 1048560 + mov r14, QWORD PTR [rax+35] + xor r14, QWORD PTR [rcx+192] + mov rsi, QWORD PTR [rcx+224] + punpcklqdq xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + + ALIGN 64 +cn_litev1_mainloop_sandybridge: + movq xmm0, rdi + movq xmm1, r8 + punpcklqdq xmm1, xmm0 + aesenc xmm2, xmm1 + movq r10, xmm2 + mov r9d, r10d + and r9d, 1048560 + add r9, rsi + movdqa xmm0, xmm2 + pxor xmm0, xmm3 + movdqa xmm3, xmm2 + movdqu XMMWORD PTR [rdx+rsi], xmm0 + psrldq xmm0, 11 + movq rax, xmm0 + movzx eax, al + movzx eax, BYTE PTR [rax+r15] + mov BYTE PTR [rsi+rdx+11], al + mov rbx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + mul r10 + add r8, rdx + mov QWORD PTR [r9], r8 + add rdi, rax + mov rax, r14 + xor rax, rdi + mov QWORD PTR [r9+8], rax + xor r8, rbx + mov rdx, r8 + and edx, 1048560 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + xor rdi, r11 + dec ebp + jne cn_litev1_mainloop_sandybridge + + mov rbx, QWORD PTR [rsp+24] + mov rbp, QWORD PTR [rsp+32] + mov rsi, QWORD PTR [rsp+40] + mov rdi, QWORD PTR [rsp+48] + pop r15 + pop r14 diff --git a/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..94f083c1 --- /dev/null +++ b/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,162 @@ + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 72 + + movaps XMMWORD PTR [rsp], xmm6 + movaps XMMWORD PTR [rsp+16], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + movaps XMMWORD PTR [rsp+48], xmm9 + + mov rax, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm4, rax + mov rax, QWORD PTR [rcx+256] + mov r13, QWORD PTR [rcx+40] + movq xmm0, rdx + xor 
r13, QWORD PTR [rcx+8] + mov rdx, r8 + mov rdi, QWORD PTR [rcx+224] + and edx, 1048560 + mov rax, QWORD PTR [rax+35] + xor rax, QWORD PTR [rcx+192] + movq xmm5, rax + movq xmm8, rdi + punpcklqdq xmm4, xmm0 + mov QWORD PTR [rsp+64], rdx + + movq xmm6, rcx + mov rax, QWORD PTR [rcx+264] + movq xmm7, rax + + mov eax, 131072 + + ALIGN 64 +cn_litev1_mainloop_soft_aes_sandybridge: + movq xmm9, rax + mov r12, QWORD PTR [rcx+272] + mov esi, DWORD PTR [rdx+rdi] + mov r10d, DWORD PTR [rdx+rdi+4] + mov ebp, DWORD PTR [rdx+rdi+12] + mov r14d, DWORD PTR [rdx+rdi+8] + mov rdx, QWORD PTR [rsp+64] + movzx ecx, sil + shr esi, 8 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + add ebp, 256 + movd xmm1, r11d + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movq rdi, xmm8 + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + punpckldq xmm2, xmm1 + movq xmm1, r8 + xor eax, DWORD PTR [r12+rcx*4] + xor eax, r15d + movd xmm3, eax + movq rax, xmm7 + punpckldq xmm3, xmm0 + movq xmm0, r13 + punpcklqdq xmm1, xmm0 + punpckldq xmm3, xmm2 + pxor xmm3, xmm1 + movq r9, xmm3 + mov r10d, r9d + and r10d, 1048560 + movdqa xmm0, xmm3 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx+rdi], xmm0 + psrldq xmm0, 11 + movq rcx, xmm0 + movzx ecx, cl + mov cl, BYTE PTR [rcx+rax] + mov BYTE PTR [rdi+rdx+11], cl + mov rbx, QWORD PTR [r10+rdi] + mov rcx, r9 + lea r9, QWORD PTR [r10+rdi] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + movdqa xmm4, xmm3 + mul rcx + movq rcx, xmm6 + add r8, rdx + add r13, rax + movq rax, xmm5 + xor rax, r13 + mov QWORD PTR [r9], r8 + xor r8, rbx + mov QWORD PTR [r9+8], rax + movq rax, xmm9 + mov rdx, r8 + xor r13, r11 + and edx, 1048560 + mov QWORD PTR [rsp+64], rdx + sub eax, 1 + jne cn_litev1_mainloop_soft_aes_sandybridge + + movaps xmm6, XMMWORD PTR [rsp] + movaps xmm7, XMMWORD PTR [rsp+16] + movaps xmm8, XMMWORD PTR [rsp+32] + movaps xmm9, XMMWORD PTR [rsp+48] + + add rsp, 72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/asm/win/cn_main_loop.asm b/src/crypto/asm/win/cn_main_loop.asm index b2b27099..81ed5691 100644 --- a/src/crypto/asm/win/cn_main_loop.asm +++ b/src/crypto/asm/win/cn_main_loop.asm @@ -6,11 +6,18 @@ PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_bulldozer_asm PUBLIC cnv2_double_mainloop_sandybridge_asm +PUBLIC cn_fast2_mainloop_ivybridge_asm +PUBLIC cn_fast2_mainloop_ryzen_asm +PUBLIC cn_fast2_mainloop_bulldozer_asm +PUBLIC cn_fast2_double_mainloop_sandybridge_asm +PUBLIC cn_liteupx_mainloop_sandybridge_asm PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm PUBLIC 
cn_fast_mainloop_soft_aes_sandybridge_asm PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm +PUBLIC cn_fast2_mainloop_soft_aes_sandybridge_asm +PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm ALIGN 64 cnv1_mainloop_sandybridge_asm PROC @@ -54,6 +61,36 @@ cnv2_double_mainloop_sandybridge_asm PROC ret 0 cnv2_double_mainloop_sandybridge_asm ENDP +ALIGN 64 +cn_fast2_mainloop_ivybridge_asm PROC + INCLUDE cn_fast2_main_loop_ivybridge.inc + ret 0 +cn_fast2_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cn_fast2_mainloop_ryzen_asm PROC + INCLUDE cn_fast2_main_loop_ryzen.inc + ret 0 +cn_fast2_mainloop_ryzen_asm ENDP + +ALIGN 64 +cn_fast2_mainloop_bulldozer_asm PROC + INCLUDE cn_fast2_main_loop_bulldozer.inc + ret 0 +cn_fast2_mainloop_bulldozer_asm ENDP + +ALIGN 64 +cn_fast2_double_mainloop_sandybridge_asm PROC + INCLUDE cn_fast2_double_main_loop_sandybridge.inc + ret 0 +cn_fast2_double_mainloop_sandybridge_asm ENDP + +ALIGN 64 +cn_liteupx_mainloop_sandybridge_asm PROC + INCLUDE cn_liteupx_mainloop_sandybridge.inc + ret 0 +cn_liteupx_mainloop_sandybridge_asm ENDP + ALIGN 64 cnv1_mainloop_soft_aes_sandybridge_asm PROC INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc @@ -78,5 +115,17 @@ cnv2_mainloop_soft_aes_sandybridge_asm PROC ret 0 cnv2_mainloop_soft_aes_sandybridge_asm ENDP +ALIGN 64 +cn_fast2_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cn_fast2_mainloop_soft_aes_sandybridge.inc + ret 0 +cn_fast2_mainloop_soft_aes_sandybridge_asm ENDP + _TEXT_CN_MAINLOOP ENDS -END \ No newline at end of file +END + +ALIGN 64 +cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cn_liteupx_mainloop_soft_aes_sandybridge.inc + ret 0 +cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP \ No newline at end of file diff --git a/src/crypto/asm/win/cn_main_loop_win_gcc.S b/src/crypto/asm/win/cn_main_loop_win_gcc.S index a550868c..db3e6c30 100644 --- a/src/crypto/asm/win/cn_main_loop_win_gcc.S +++ b/src/crypto/asm/win/cn_main_loop_win_gcc.S @@ -10,11 +10,18 @@ .global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_bulldozer_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_fast2_mainloop_ivybridge_asm) +.global FN_PREFIX(cn_fast2_mainloop_ryzen_asm) +.global FN_PREFIX(cn_fast2_mainloop_bulldozer_asm) +.global FN_PREFIX(cn_fast2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_fast2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) ALIGN 64 FN_PREFIX(cnv1_mainloop_sandybridge_asm): @@ -51,6 +58,31 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): #include "../cnv2_double_main_loop_sandybridge.inc" ret 0 +ALIGN 64 +FN_PREFIX(cn_fast2_mainloop_ivybridge_asm): + #include "../cn_fast2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_fast2_mainloop_ryzen_asm): + #include "../cn_fast2_main_loop_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_fast2_mainloop_bulldozer_asm): + #include "../cn_fast2_main_loop_bulldozer.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_fast2_double_mainloop_sandybridge_asm): + #include "../cn_fast2_double_main_loop_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): + #include "../cn_liteupx_mainloop_sandybridge.inc" + ret 0 + ALIGN 64 
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): #include "../cnv1_mainloop_soft_aes_sandybridge.inc" @@ -69,4 +101,15 @@ FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm): ALIGN 64 FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): #include "../cnv2_mainloop_soft_aes_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_fast2_mainloop_soft_aes_sandybridge_asm): + #include "../cn_fast2_mainloop_soft_aes_sandybridge.inc" + ret 0 + + +ALIGN 64 +FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): + #include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc" ret 0 \ No newline at end of file diff --git a/src/default_miner_config.json b/src/default_miner_config.json index 2068e1a8..7928cbc8 100644 --- a/src/default_miner_config.json +++ b/src/default_miner_config.json @@ -4,7 +4,7 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto, xfh + "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) @@ -20,6 +20,7 @@ "safe": false, // true to safe adjust threads and av settings for current CPU "syslog": false, // use system log for output messages "reboot-cmd" : "", // command to execute to reboot the OS + "force-pow-variant" : false, // force pow variant, dont parse pow/variant from pool job "pools": [ { "url": "donate2.graef.in:80", // URL of mining server diff --git a/src/net/Client.cpp b/src/net/Client.cpp index c468114b..d2ecf074 100644 --- a/src/net/Client.cpp +++ b/src/net/Client.cpp @@ -231,27 +231,29 @@ bool Client::parseJob(const rapidjson::Value ¶ms, int *code) PowVariant powVariant = Options::i()->powVariant(); - if (params.HasMember("algo")) { - std::string algo = params["algo"].GetString(); + if (!Options::i()->forcePowVariant()) { + if (params.HasMember("algo")) { + std::string algo = params["algo"].GetString(); - if (algo.find("/") != std::string::npos) { - powVariant = parseVariant(algo.substr(algo.find("/")+1)); - } - } - - if (params.HasMember("variant")) { - const rapidjson::Value &variant = params["variant"]; - - PowVariant parsedVariant = powVariant; - - if (variant.IsInt()) { - parsedVariant = parseVariant(variant.GetInt()); - } else if (variant.IsString()) { - parsedVariant = parseVariant(variant.GetString()); + if (algo.find("/") != std::string::npos) { + powVariant = parseVariant(algo.substr(algo.find("/") + 1)); + } } - if (parsedVariant != POW_AUTODETECT) { - powVariant = parsedVariant; + if (params.HasMember("variant")) { + const rapidjson::Value& variant = 
params["variant"]; + + PowVariant parsedVariant = powVariant; + + if (variant.IsInt()) { + parsedVariant = parseVariant(variant.GetInt()); + } else if (variant.IsString()) { + parsedVariant = parseVariant(variant.GetString()); + } + + if (parsedVariant != POW_AUTODETECT) { + powVariant = parsedVariant; + } } } diff --git a/src/net/Job.cpp b/src/net/Job.cpp index bbf08088..dbaf5be9 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -146,6 +146,10 @@ PowVariant Job::powVariant() const } else { return PowVariant::POW_V0; } + } else if (m_powVariant == PowVariant::POW_XTL) { + if (m_blob[0] > 5) { + return PowVariant::POW_XTL_V9; + } } else { return m_powVariant; } diff --git a/src/version.h b/src/version.h index 7bc9dc01..d80e09d4 100644 --- a/src/version.h +++ b/src/version.h @@ -36,14 +36,14 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.8.7 (based on XMRig)" +#define APP_VERSION "1.8.8 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 #define APP_VER_MINOR 8 -#define APP_VER_BUILD 7 +#define APP_VER_BUILD 8 #define APP_VER_REV 0 #ifndef NDEBUG