diff --git a/CHANGELOG.md b/CHANGELOG.md index 7e46109d..c1065883 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,5 @@ +# 1.8.9 +- Added cn-ultralite algo used by upcoming TurtleV2 fork (algo: "cryptonight-ultralite", variant "auto") # 1.8.8 - Added XLT v5/9 with autodetect(algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9)) - Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant "upx") diff --git a/README.md b/README.md index 3eb25787..151b8e1b 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ Full Windows/Linux compatible, and you can mix Linux and Windows miner on one XM ## Additional features of XMRigCC (on top of XMRig) Check the [Coin Configuration](https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations) guide +* **NEW Support of Cryptonight-Ultralite TRTL/Turtle variant (algo: "cryptonight-ultralite", variant "auto")** * **NEW Support of Crytptonight-Lite UPX/uPlexa variant (algo: "cryptonight-lite", variant "upx")** * **NEW Support of Crytptonight XTL v5/v9 PoW changes aka CN-FastV2 (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))** * **Support of Crytptonight XFH/SWAP variant aka CN-Heavy-Fast** diff --git a/src/Cpu.cpp b/src/Cpu.cpp index 56b09abc..1605330c 100644 --- a/src/Cpu.cpp +++ b/src/Cpu.cpp @@ -67,6 +67,12 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor, size_t cache = availableCache(); size_t algoBlockSize; switch (algo) { + case Options::ALGO_CRYPTONIGHT_ULTRALITE: + algoBlockSize = 256; + break; + case Options::ALGO_CRYPTONIGHT_SUPERLITE: + algoBlockSize = 512; + break; case Options::ALGO_CRYPTONIGHT_LITE: algoBlockSize = 1024; break; @@ -81,8 +87,17 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor, size_t maximumReasonableFactor = std::max(cache / algoBlockSize, static_cast(1ul)); size_t maximumReasonableThreadCount = std::min(maximumReasonableFactor, m_totalThreads); - size_t maximumReasonableHashFactor = 
std::min(maximumReasonableFactor, (algo == Options::ALGO_CRYPTONIGHT_HEAVY || powVariant == POW_XFH) ? 3 : static_cast(MAX_NUM_HASH_BLOCKS)); + size_t maximumReasonableHashFactor = static_cast(MAX_NUM_HASH_BLOCKS); + if (algo == Options::ALGO_CRYPTONIGHT_HEAVY || powVariant == POW_XFH) { + maximumReasonableHashFactor = 3; + } else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) { + if (m_asmOptimization == ASM_INTEL) { + maximumReasonableHashFactor = 2; + } else { + maximumReasonableHashFactor = 1; + } + } if (safeMode) { if (threadsCount > maximumReasonableThreadCount) { threadsCount = maximumReasonableThreadCount; diff --git a/src/Mem.cpp b/src/Mem.cpp index 2a9b1926..1865ba36 100644 --- a/src/Mem.cpp +++ b/src/Mem.cpp @@ -40,6 +40,12 @@ ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId) size_t scratchPadSize; switch (m_algo) { + case Options::ALGO_CRYPTONIGHT_ULTRALITE: + scratchPadSize = MEMORY_ULTRA_LITE; + break; + case Options::ALGO_CRYPTONIGHT_SUPERLITE: + scratchPadSize = MEMORY_SUPER_LITE; + break; case Options::ALGO_CRYPTONIGHT_LITE: scratchPadSize = MEMORY_LITE; break; diff --git a/src/Options.cpp b/src/Options.cpp index 7efae639..707a2e5b 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -63,7 +63,7 @@ Usage: " APP_ID " [OPTIONS]\n\ Options:\n" # ifndef XMRIG_CC_SERVER "\ - -a, --algo=ALGO cryptonight (default), cryptonight-lite or cryptonight-heavy\n\ + -a, --algo=ALGO cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy\n\ -o, --url=URL URL of mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ -u, --user=USERNAME username for mining server\n\ @@ -73,7 +73,7 @@ Options:\n" -k, --keepalive send keepalived for prevent timeout (need pool support)\n\ -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\ - --pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' 
(v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx'\n\ + --pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx', 'turtle'\n\ for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\ --asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\ --multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\ @@ -303,12 +303,16 @@ static struct option const cc_server_options[] = { static const char *algo_names[] = { "cryptonight", "cryptonight-lite", + "cryptonight-superlite", + "cryptonight-ultralite", "cryptonight-heavy" }; static const char *algo_short_names[] = { "cn", "cn-lite", + "cn-superlite", + "cn-ultralite", "cn-heavy" }; @@ -325,7 +329,8 @@ constexpr static const char *pow_variant_names[] = { "rto", "xfh", "xtlv9", - "upx" + "upx", + "turtle" }; constexpr static const char *asm_optimization_names[] = { @@ -1086,6 +1091,17 @@ bool Options::setAlgo(const char *algo) break; } + if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-super-lite") || !strcmp(algo, "cryptonight-super-lite") || !strcmp(algo, "cryptonight-superlight"))) { + m_algo = ALGO_CRYPTONIGHT_SUPERLITE; + break; + } + + + if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-ultra-lite") || !strcmp(algo, "cryptonight-ultra-lite") || !strcmp(algo, "cryptonight-ultralight"))) { + m_algo = ALGO_CRYPTONIGHT_ULTRALITE; + break; + } + if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) { showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")"); m_algo = 
ALGO_CRYPTONIGHT_LITE; @@ -1160,7 +1176,7 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellitev9")) { + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2"))) { m_powVariant = POW_XTL_V9; break; } @@ -1170,6 +1186,11 @@ bool Options::parsePowVariant(const char *powVariant) break; } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "trtl")) { + m_powVariant = POW_TURTLE; + break; + } + if (i == ARRAY_SIZE(pow_variant_names) - 1) { showUsage(1); return false; diff --git a/src/Options.h b/src/Options.h index 7d4c1309..8c3e730a 100644 --- a/src/Options.h +++ b/src/Options.h @@ -46,7 +46,9 @@ public: enum Algo { ALGO_CRYPTONIGHT, /* CryptoNight (2MB ScratchPad) */ ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (1MB ScratchPad) */ - ALGO_CRYPTONIGHT_HEAVY /* CryptoNight-Heavy (4MB ScratchPad) */ + ALGO_CRYPTONIGHT_SUPERLITE, /* CryptoNight-Superlite (512KB ScratchPad) */ + ALGO_CRYPTONIGHT_ULTRALITE, /* CryptoNight-Ultralite (256KB ScratchPad) */ + ALGO_CRYPTONIGHT_HEAVY, /* CryptoNight-Heavy (4MB ScratchPad) */ }; enum AlgoVariant { diff --git a/src/PowVariant.h b/src/PowVariant.h index dfb3c223..af2d59e4 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -37,6 +37,7 @@ enum PowVariant POW_XFH, POW_XTL_V9, POW_UPX, + POW_TURTLE, LAST_ITEM }; @@ -68,6 +69,8 @@ inline std::string getPowVariantName(PowVariant powVariant) return "xtlv9"; case POW_UPX: return "upx"; + case POW_TURTLE: + return "turtle"; case POW_AUTODETECT: default: return "-1"; @@ -135,10 +138,12 @@ inline PowVariant parseVariant(const std::string variant) powVariant = PowVariant::POW_RTO; } else if (variant == "xfh" || variant == "freehaven" || variant == "faven") { powVariant = PowVariant::POW_XFH; - } else if (variant == "xtlv9" || variant == "stellite_v9") { + } else if (variant == "xtlv9" || variant == "stellite_v9" || variant == 
"xtlv2") { powVariant = PowVariant::POW_XTL_V9; } else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") { powVariant = PowVariant::POW_UPX; + } else if (variant == "turtle" || variant == "trtl") { + powVariant = PowVariant::POW_TURTLE; } return powVariant; diff --git a/src/config.json b/src/config.json index 5e8e6eaa..431e197f 100644 --- a/src/config.json +++ b/src/config.json @@ -1,10 +1,10 @@ { - "algo": "cryptonight", // cryptonight (default), cryptonight-lite or cryptonight-heavy + "algo": "cryptonight", // cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy "aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off) "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx + "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx, turtle // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp 
index c31f28ba..e387bf15 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -77,7 +77,7 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) || (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); } else { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); } @@ -129,7 +129,7 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); #else if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); } else { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); } @@ -225,6 +225,44 @@ static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant } } +template +static void cryptonight_super_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, 
ScratchPad** scratchPad) { + +} + +template +static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { + +} + +template +static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || + (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) || + (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) { + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif +} + +template +static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif 
+} + template static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) @@ -275,6 +313,22 @@ void setCryptoNightHashMethods(Options::Algo algo, bool aesni) } break; + case Options::ALGO_CRYPTONIGHT_SUPERLITE: + if (aesni) { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_super_lite_aesni; + } else { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_super_lite_softaes; + } + break; + + case Options::ALGO_CRYPTONIGHT_ULTRALITE: + if (aesni) { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_ultra_lite_aesni; + } else { + cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_ultra_lite_softaes; + } + break; + case Options::ALGO_CRYPTONIGHT_HEAVY: if (aesni) { cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_aesni; @@ -316,19 +370,20 @@ void CryptoNight::hash(size_t factor, AsmOptimization asmOptimization, PowVarian bool CryptoNight::selfTest(int algo) { if (cryptonight_hash_ctx[0] == nullptr -#if MAX_NUM_HASH_BLOCKS > 1 + #if MAX_NUM_HASH_BLOCKS > 1 || cryptonight_hash_ctx[1] == nullptr -#endif -#if MAX_NUM_HASH_BLOCKS > 2 + #endif + #if MAX_NUM_HASH_BLOCKS > 2 || cryptonight_hash_ctx[2] == nullptr -#endif -#if MAX_NUM_HASH_BLOCKS > 3 + #endif + #if MAX_NUM_HASH_BLOCKS > 3 || cryptonight_hash_ctx[3] == nullptr -#endif -#if MAX_NUM_HASH_BLOCKS > 4 + #endif + #if MAX_NUM_HASH_BLOCKS > 4 || cryptonight_hash_ctx[4] == nullptr -#endif - ) { + #endif + ) + { return false; } @@ -345,6 +400,8 @@ bool CryptoNight::selfTest(int algo) bool result = true; bool resultLite = true; + bool resultSuperLite = true; + bool resultUltraLite = true; bool resultHeavy = true; AsmOptimization asmOptimization = Options::i()->asmOptimization(); @@ -476,6 +533,17 @@ bool CryptoNight::selfTest(int algo) cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads); resultLite = resultLite && 
memcmp(output, test_output_upx, 32) == 0; + + } else if (algo == Options::ALGO_CRYPTONIGHT_SUPERLITE) { + + return false; + + } else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) { + // cn ultralite (cnv8 + turtle) + + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + resultUltraLite = resultUltraLite && memcmp(output, test_output_turtle, 32) == 0; + } else { // cn v0 aka orignal @@ -527,7 +595,7 @@ bool CryptoNight::selfTest(int algo) result = result && memcmp(output, test_output_v1, 160) == 0; #endif - // cn v7 + xtl + // cnv7 + xtl cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 32) == 0; @@ -542,7 +610,7 @@ bool CryptoNight::selfTest(int algo) cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_alloy, 32) == 0; - // cn v8 aka cnv2 + // cnv8 aka cnv2 cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v2, 32) == 0; @@ -583,5 +651,5 @@ bool CryptoNight::selfTest(int algo) _mm_free(scratchPads[i]); } - return result && resultLite & resultHeavy; + return result && resultLite && resultSuperLite && resultUltraLite && resultHeavy; } \ No newline at end of file diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 259e365f..9d8f2cca 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -33,6 +33,8 @@ #define MEMORY 2097152 /* 2 MiB */ #define MEMORY_LITE 1048576 /* 1 MiB */ +#define MEMORY_SUPER_LITE 524288 /* 512 KiB */ +#define MEMORY_ULTRA_LITE 262144 /* 256 KiB */ #define MEMORY_HEAVY 4194304 /* 4 MiB */ #define POW_DEFAULT_INDEX_SHIFT 3 diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index d2f2634d..b9a940d7 100644 --- a/src/crypto/CryptoNight_test.h +++ 
b/src/crypto/CryptoNight_test.h @@ -212,4 +212,10 @@ const static uint8_t test_output_heavy_tube[96] = { 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb }; +// CN-Ultralite/Turtle +const static uint8_t test_output_turtle[32] = { + 0x08, 0xF4, 0x21, 0xD7, 0x83, 0x31, 0x17, 0x30, 0x0E, 0xDA, 0x66, 0xE9, 0x8F, 0x4A, 0x25, 0x69, + 0x09, 0x3D, 0xF3, 0x00, 0x50, 0x01, 0x73, 0x94, 0x4E, 0xFC, 0x40, 0x1E, 0x9A, 0x4A, 0x17, 0xAF +}; + #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 82fe8e40..a52a28f4 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -62,12 +62,17 @@ extern "C" void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0); void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0); + void cn_ultralitev2_mainloop_ivybridge_asm(ScratchPad* ctx0); + void cn_ultralitev2_mainloop_ryzen_asm(ScratchPad* ctx0); + void cn_ultralitev2_mainloop_bulldozer_asm(ScratchPad* ctx0); + void cn_ultralitev2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx); #endif } @@ -775,8 +780,7 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - uint64_t idx[NUM_HASH_BLOCKS];CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube( - input, size, output, scratchPad); + uint64_t 
idx[NUM_HASH_BLOCKS]; uint64_t sqrt_result[NUM_HASH_BLOCKS]; __m128i bx0[NUM_HASH_BLOCKS]; __m128i bx1[NUM_HASH_BLOCKS]; @@ -883,15 +887,6 @@ public: // not supported } - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization) - { - // not supported - } - inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1564,14 +1559,38 @@ public: if (SOFT_AES) { scratchPad[0]->input = input; scratchPad[0]->t_fn = (const uint32_t*)saes_table; - cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + if (ITERATIONS == 0x40000) { + cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else if (ITERATIONS == 0x10000) { + cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } } else { - cnv2_mainloop_ivybridge_asm(scratchPad[0]); + if (ITERATIONS == 0x10000) { + cn_ultralitev2_mainloop_ivybridge_asm(scratchPad[0]); + } else if (ITERATIONS == 0x40000) { + cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]); + } else { + cnv2_mainloop_ivybridge_asm(scratchPad[0]); + } } } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { - cnv2_mainloop_ryzen_asm(scratchPad[0]); + if (ITERATIONS == 0x10000) { + cn_ultralitev2_mainloop_ryzen_asm(scratchPad[0]); + } else if (ITERATIONS == 0x40000) { + cn_fastv2_mainloop_ryzen_asm(scratchPad[0]); + } else { + cnv2_mainloop_ryzen_asm(scratchPad[0]); + } } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { - cnv2_mainloop_bulldozer_asm(scratchPad[0]); + if (ITERATIONS == 0x10000) { + cn_ultralitev2_mainloop_bulldozer_asm(scratchPad[0]); + } else if (ITERATIONS == 0x40000) { + cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]); + } else { + cnv2_mainloop_bulldozer_asm(scratchPad[0]); + } } #endif @@ -1580,39 +1599,6 @@ public: 
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } - // single asm - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization) - { - const uint8_t* l = scratchPad[0]->memory; - uint64_t* h = reinterpret_cast(scratchPad[0]->state); - - keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - cn_explode_scratchpad((__m128i*) h, (__m128i*) l); - -#ifndef XMRIG_NO_ASM - if (asmOptimization == AsmOptimization::ASM_INTEL) { - if (SOFT_AES) { - scratchPad[0]->input = input; - scratchPad[0]->t_fn = (const uint32_t*)saes_table; - cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); - } else { - cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]); - } - } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { - cn_fastv2_mainloop_ryzen_asm(scratchPad[0]); - } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { - cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]); - } -#endif - - cn_implode_scratchpad((__m128i*) l, (__m128i*) h); - keccakf(h, 24); - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, @@ -2320,39 +2306,13 @@ public: cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); #ifndef XMRIG_NO_ASM - cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); -#endif - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - - keccakf(h0, 24); - keccakf(h1, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - } - - // double asm - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - 
AsmOptimization asmOptimization) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - -#ifndef XMRIG_NO_ASM - cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + if (ITERATIONS == 0x10000) { + cn_ultralitev2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + } else if (ITERATIONS == 0x40000) { + cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + } else { + cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + } #endif cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); @@ -3312,15 +3272,6 @@ public: // not supported } - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization) - { - // not supported - } - inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4576,15 +4527,6 @@ public: // not supported } - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization) - { - // not supported - } - inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -5510,15 +5452,6 @@ public: // not supported } - inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad, - AsmOptimization asmOptimization) - { - 
// not supported - } - inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index e423543b..3effbbfe 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -19,6 +19,10 @@ .global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm) .global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm) +.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) @@ -26,6 +30,7 @@ .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) #ifdef __APPLE__ ALIGN 16 @@ -173,6 +178,55 @@ FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): add rsp, 48 ret 0 +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_ultralitev2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cn_ultralitev2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_ultralitev2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): + 
sub rsp, 48 + mov rcx, rdi + #include "cn_ultralitev2_main_loop_bulldozer.inc" + add rsp, 48 + ret 0 + #ifdef __APPLE__ ALIGN 16 #else @@ -244,3 +298,15 @@ FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): #include "cn_liteupx_mainloop_soft_aes_sandybridge.inc" add rsp, 48 ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cn_ultralitev2_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 \ No newline at end of file diff --git a/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..2cc3c4f9 --- /dev/null +++ b/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc @@ -0,0 +1,414 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 65536 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 131056 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR 
[rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 131056 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +main_loop_double_ultralitev2_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 131056 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + 
xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 131056 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 131056 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd 
xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_ultralitev2_sandybridge +div_fix_1_ret_ultralitev2_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_ultralitev2_sandybridge +div_fix_2_ret_ultralitev2_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_ultralitev2_sandybridge +sqrt_fix_1_ret_ultralitev2_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_ultralitev2_sandybridge +sqrt_fix_2_ret_ultralitev2_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 131056 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d 
+ jne main_loop_double_ultralitev2_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp + +div_fix_1_ultralitev2_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_ultralitev2_sandybridge + +div_fix_2_ultralitev2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_ultralitev2_sandybridge + +sqrt_fix_1_ultralitev2_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_ultralitev2_sandybridge + +sqrt_fix_2_ultralitev2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_ultralitev2_sandybridge + +cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp: diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc b/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc new file mode 100644 index 00000000..403cff04 --- /dev/null +++ b/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push 
r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 65536 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 131056 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_ultralitev2_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movq r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 131056 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi + movq 
xmm0, rax + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ultralitev2_bulldozer + shr rdi, 19 + +sqrt_fixup_ultralitev2_bulldozer_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 131056 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_ultralitev2_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ultralitev2_bulldozer_endp + +sqrt_fixup_ultralitev2_bulldozer: + movq r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ultralitev2_bulldozer_ret + +cnv2_main_loop_ultralitev2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc b/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc new file mode 100644 index 00000000..b0488836 --- /dev/null +++ b/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + 
sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 65536 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 131056 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ultralitev2_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 131056 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 
+ add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_ultralitev2_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_ultralitev2_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 131056 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_ultralitev2_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp $cnv2_main_loop_ultralitev2_ivybridge_endp + +$sqrt_fixup_ultralitev2_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_ultralitev2_ivybridge_ret + +$cnv2_main_loop_ultralitev2_ivybridge_endp: diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc b/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc new file mode 100644 index 00000000..e50ff9e9 --- /dev/null +++ b/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc @@ -0,0 +1,183 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + 
mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 65536 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 131056 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ultralitev2_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 131056 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + 
div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_ultralitev2_ryzen + shr rdi, 19 + +$sqrt_fixup_ultralitev2_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 131056 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_ultralitev2_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_ultralitev2_ryzen_endp + +$sqrt_fixup_ultralitev2_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_ultralitev2_ryzen_ret + +$cnv2_main_loop_ultralitev2_ryzen_endp: diff --git a/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..8ab8a060 --- /dev/null +++ b/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,271 @@ + 
mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 131056 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 65536 /* loop counter: 65536 iterations, the same count the AES-NI ultralitev2 main loops use (was 262144) */ + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv2_mainloop_soft_aes_ultralitev2_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor
r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 131056 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + 
je sqrt_fixup_soft_aes_ultralitev2_sandybridge + psrlq xmm1, 19 +sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 131056 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp + +sqrt_fixup_soft_aes_ultralitev2_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret + +cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp: diff --git a/src/crypto/asm/win/cn_main_loop.asm b/src/crypto/asm/win/cn_main_loop.asm index 
54943f1c..b9635a36 100644 --- a/src/crypto/asm/win/cn_main_loop.asm +++ b/src/crypto/asm/win/cn_main_loop.asm @@ -11,6 +11,10 @@ PUBLIC cn_fastv2_mainloop_ryzen_asm PUBLIC cn_fastv2_mainloop_bulldozer_asm PUBLIC cn_fastv2_double_mainloop_sandybridge_asm PUBLIC cn_liteupx_mainloop_sandybridge_asm +PUBLIC cn_ultralitev2_mainloop_ivybridge_asm +PUBLIC cn_ultralitev2_mainloop_ryzen_asm +PUBLIC cn_ultralitev2_mainloop_bulldozer_asm +PUBLIC cn_ultralitev2_double_mainloop_sandybridge_asm PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm @@ -18,6 +22,7 @@ PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm +PUBLIC cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ALIGN 64 cnv1_mainloop_sandybridge_asm PROC @@ -91,6 +96,30 @@ cn_liteupx_mainloop_sandybridge_asm PROC ret 0 cn_liteupx_mainloop_sandybridge_asm ENDP +ALIGN 64 +cn_ultralitev2_mainloop_ivybridge_asm PROC + INCLUDE cn_ultralitev2_main_loop_ivybridge.inc + ret 0 +cn_ultralitev2_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cn_ultralitev2_mainloop_ryzen_asm PROC + INCLUDE cn_ultralitev2_main_loop_ryzen.inc + ret 0 +cn_ultralitev2_mainloop_ryzen_asm ENDP + +ALIGN 64 +cn_ultralitev2_mainloop_bulldozer_asm PROC + INCLUDE cn_ultralitev2_main_loop_bulldozer.inc + ret 0 +cn_ultralitev2_mainloop_bulldozer_asm ENDP + +ALIGN 64 +cn_ultralitev2_double_mainloop_sandybridge_asm PROC + INCLUDE cn_ultralitev2_double_main_loop_sandybridge.inc + ret 0 +cn_ultralitev2_double_mainloop_sandybridge_asm ENDP + ALIGN 64 cnv1_mainloop_soft_aes_sandybridge_asm PROC INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc @@ -127,5 +156,11 @@ cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC ret 0 cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP +ALIGN 64 +cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE 
cn_ultralitev2_mainloop_soft_aes_sandybridge.inc + ret 0 +cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ENDP + _TEXT_CN_MAINLOOP ENDS END \ No newline at end of file diff --git a/src/crypto/asm/win/cn_main_loop_win_gcc.S b/src/crypto/asm/win/cn_main_loop_win_gcc.S index 1b815ccc..159937ef 100644 --- a/src/crypto/asm/win/cn_main_loop_win_gcc.S +++ b/src/crypto/asm/win/cn_main_loop_win_gcc.S @@ -15,6 +15,10 @@ .global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm) .global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm) +.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) @@ -22,6 +26,7 @@ .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) ALIGN 64 FN_PREFIX(cnv1_mainloop_sandybridge_asm): @@ -83,6 +88,26 @@ FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): #include "../cn_liteupx_mainloop_sandybridge.inc" ret 0 +ALIGN 64 +FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): + #include "../cn_ultralitev2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): + #include "../cn_ultralitev2_main_loop_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): + #include "../cn_ultralitev2_main_loop_bulldozer.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): + #include "../cn_ultralitev2_double_main_loop_sandybridge.inc" + ret 0 + ALIGN 64 FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): #include 
"../cnv1_mainloop_soft_aes_sandybridge.inc" @@ -108,8 +133,12 @@ FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): #include "../cn_fastv2_mainloop_soft_aes_sandybridge.inc" ret 0 - ALIGN 64 FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): #include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc" - ret 0 \ No newline at end of file + ret 0 + +ALIGN 64 +FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm): + #include "../cn_ultralitev2_mainloop_soft_aes_sandybridge.inc" + ret 0 \ No newline at end of file diff --git a/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..30d6e814 --- /dev/null +++ b/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 65536 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 131056 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR 
[rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 131056 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 64 +main_loop_double_ultralitev2_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 131056 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR 
[rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 131056 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 131056 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + 
cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_ultralitev2_sandybridge +div_fix_1_ret_ultralitev2_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_ultralitev2_sandybridge +div_fix_2_ret_ultralitev2_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_ultralitev2_sandybridge +sqrt_fix_1_ret_ultralitev2_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_ultralitev2_sandybridge +sqrt_fix_2_ret_ultralitev2_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 131056 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne 
main_loop_double_ultralitev2_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp + +div_fix_1_ultralitev2_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_ultralitev2_sandybridge + +div_fix_2_ultralitev2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_ultralitev2_sandybridge + +sqrt_fix_1_ultralitev2_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_ultralitev2_sandybridge + +sqrt_fix_2_ultralitev2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_ultralitev2_sandybridge + +cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc new file mode 100644 index 00000000..311f6fa3 --- /dev/null +++ b/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc @@ -0,0 +1,180 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 
+ push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 65536 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movd xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movd xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 131056 + movaps XMMWORD PTR [rsp+48], xmm6 + movd xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movd xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movd xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +cnv2_main_loop_ultralitev2_bulldozer: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movd xmm6, r8 + pinsrq xmm6, r11, 1 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + + mov edi, 1023 + shl rdi, 52 + + movd r14, xmm5 + pextrq rax, xmm5, 1 + + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 131056 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + div r9 + mov eax, eax + shl rdx, 32 + lea r15, [rax+rdx] + lea rax, [r14+r15] + shr rax, 12 + add rax, rdi 
+ movd xmm0, rax + sqrtsd xmm1, xmm0 + movd rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ultralitev2_bulldozer + shr rdi, 19 + +sqrt_fixup_ultralitev2_bulldozer_ret: + mov rax, rsi + mul r14 + movd xmm1, rax + movd xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 131056 + movdqa xmm3, xmm5 + dec ebp + jne cnv2_main_loop_ultralitev2_bulldozer + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ultralitev2_bulldozer_endp + +sqrt_fixup_ultralitev2_bulldozer: + movd r9, xmm5 + add r9, r15 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ultralitev2_bulldozer_ret + +cnv2_main_loop_ultralitev2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc new file mode 100644 index 00000000..d2295c9a --- /dev/null +++ b/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc @@ -0,0 +1,182 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push 
r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 65536 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 131056 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 64 +$main_loop_ultralitev2_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 131056 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD 
PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_ultralitev2_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_ultralitev2_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 131056 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_ultralitev2_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp $cnv2_main_loop_ultralitev2_ivybridge_endp + +$sqrt_fixup_ultralitev2_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_ultralitev2_ivybridge_ret + +$cnv2_main_loop_ultralitev2_ivybridge_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc new file mode 100644 index 00000000..d3ef878a --- /dev/null +++ b/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR 
[rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 65536 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 131056 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 64 +$main_loop_ultralitev2_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 131056 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, 
xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_ultralitev2_ryzen + shr rdi, 19 + +$sqrt_fixup_ultralitev2_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 131056 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_ultralitev2_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_ultralitev2_ryzen_endp + +$sqrt_fixup_ultralitev2_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_ultralitev2_ryzen_ret + +$cnv2_main_loop_ultralitev2_ryzen_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..7025a29e --- /dev/null +++ b/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,267 @@ + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp 
+ push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 131056 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 65536 + + ALIGN 64 +cnv2_mainloop_soft_aes_ultralitev2_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + 
xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 131056 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + je sqrt_fixup_soft_aes_ultralitev2_sandybridge + psrlq xmm1, 19 
+sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 131056 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp + +sqrt_fixup_soft_aes_ultralitev2_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret + +cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp: diff --git a/src/default_miner_config.json b/src/default_miner_config.json index 7928cbc8..dc99a3d2 100644 --- a/src/default_miner_config.json +++ 
b/src/default_miner_config.json @@ -1,10 +1,10 @@ { - "algo": "cryptonight", // cryptonight (default), cryptonight-lite or cryptonight-heavy + "algo": "cryptonight", // cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy "aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off) "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx + "pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx, turtle // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) diff --git a/src/net/Job.cpp b/src/net/Job.cpp index dbaf5be9..44fc21e8 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -138,6 +138,10 @@ bool Job::setTarget(const char *target) PowVariant Job::powVariant() const { + if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) { + return PowVariant::POW_V2; + } + if (m_powVariant == PowVariant::POW_AUTODETECT) { if (m_blob[0] > 7) { return PowVariant::POW_V2; diff --git
a/src/net/strategies/DonateStrategy.cpp b/src/net/strategies/DonateStrategy.cpp index bcd059b3..b3313a5a 100644 --- a/src/net/strategies/DonateStrategy.cpp +++ b/src/net/strategies/DonateStrategy.cpp @@ -60,6 +60,8 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) : url = new Url("donate2.graef.in", 8443, userId, nullptr, true, false, true); } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) { url = new Url("donate2.graef.in", 1080, userId, nullptr, true, false, true); + } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) { + url = new Url("donate2.graef.in", 8090, userId, nullptr, true, false, true); } else { url = new Url("donate2.graef.in", 443, userId, nullptr, true, false, true); } @@ -68,6 +70,8 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) : url = new Url("donate.graef.in", 8443, userId, nullptr, false, false, true); } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) { url = new Url("donate.graef.in", 1080, userId, nullptr, false, false, true); + } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) { + url = new Url("donate2.graef.in", 8088, userId, nullptr, false, false, true); } else { url = new Url("donate2.graef.in", 80, userId, nullptr, false, false, true); } diff --git a/src/version.h b/src/version.h index d80e09d4..bd49d5ee 100644 --- a/src/version.h +++ b/src/version.h @@ -36,14 +36,14 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.8.8 (based on XMRig)" +#define APP_VERSION "1.8.9 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 #define APP_VER_MINOR 8 -#define APP_VER_BUILD 8 +#define APP_VER_BUILD 9 #define APP_VER_REV 0 #ifndef NDEBUG