Integrated new Algos (#224)
- Added XLT v5/9 with autodetect(algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9)) - Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant "upx") - Added force-pow-variant parameter to force usage of the variant from the config and skip parsing of pow/variant from job/pool
This commit is contained in:
parent
d36797d696
commit
1273e45e46
30 changed files with 3372 additions and 52 deletions
|
@ -1,3 +1,7 @@
|
|||
# 1.8.8
|
||||
- Added XLT v5/9 with autodetect(algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))
|
||||
- Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant "upx")
|
||||
- Added force-pow-variant parameter to force usage of the variant from the config and skip parsing of pow/variant from job/pool
|
||||
# 1.8.7
|
||||
- Implemented Template based mass config editor to simple swap configs on your rigs
|
||||
# 1.8.6
|
||||
|
|
|
@ -73,7 +73,7 @@ Options:\n"
|
|||
-k, --keepalive send keepalived for prevent timeout (need pool support)\n\
|
||||
-r, --retries=N number of times to retry before switch to backup server (default: 5)\n\
|
||||
-R, --retry-pause=N time to pause between retries (default: 5)\n\
|
||||
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for v5), 'rto', 'xfh'\n\
|
||||
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx'\n\
|
||||
for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
|
||||
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
|
||||
--multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
|
||||
|
@ -90,7 +90,9 @@ Options:\n"
|
|||
--print-time=N print hashrate report every N seconds\n\
|
||||
--api-port=N port for the miner API\n\
|
||||
--api-access-token=T access token for API\n\
|
||||
--api-worker-id=ID custom worker-id for API\n"
|
||||
--api-worker-id=ID custom worker-id for API\n\
|
||||
--reboot-cmd command/bat to execute to Reboot miner\n\
|
||||
--force-pow-variant disable pow/variant parsing from pool\n"
|
||||
# ifndef XMRIG_NO_CC
|
||||
"\
|
||||
--cc-url=URL url of the CC Server\n\
|
||||
|
@ -99,8 +101,7 @@ Options:\n"
|
|||
--cc-worker-id=ID custom worker-id for CC Server\n\
|
||||
--cc-update-interval-s=N status update interval in seconds (default: 10 min: 1)\n\
|
||||
--cc-use-remote-logging enable remote logging on CC Server\n\
|
||||
--cc-upload-config-on-startup upload current miner config to CC Server on startup\n\
|
||||
--cc-reboot-cmd command/bat to execute to Reboot miner\n"
|
||||
--cc-upload-config-on-startup upload current miner config to CC Server on startup\n"
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
@ -175,8 +176,9 @@ static struct option const options[] = {
|
|||
{ "userpass", 1, nullptr, 'O' },
|
||||
{ "version", 0, nullptr, 'V' },
|
||||
{ "use-tls", 0, nullptr, 1015 },
|
||||
{ "force-pow-version", 1, nullptr, 1016 },
|
||||
{ "pow-variant" ,1, nullptr, 1017 },
|
||||
{ "force-pow-variant", 0, nullptr, 1016 },
|
||||
{ "pow-variant", 1, nullptr, 1017 },
|
||||
{ "variant", 1, nullptr, 1017 },
|
||||
{ "api-port", 1, nullptr, 4000 },
|
||||
{ "api-access-token", 1, nullptr, 4001 },
|
||||
{ "api-worker-id", 1, nullptr, 4002 },
|
||||
|
@ -232,8 +234,9 @@ static struct option const config_options[] = {
|
|||
{ "syslog", 0, nullptr, 'S' },
|
||||
{ "threads", 1, nullptr, 't' },
|
||||
{ "user-agent", 1, nullptr, 1008 },
|
||||
{ "force-pow-version", 1, nullptr, 1016 },
|
||||
{ "force-pow-variant", 0, nullptr, 1016 },
|
||||
{ "pow-variant", 1, nullptr, 1017 },
|
||||
{ "variant", 1, nullptr, 1017 },
|
||||
{ "doublehash-thread-mask", 1, nullptr, 4013 },
|
||||
{ "multihash-thread-mask", 1, nullptr, 4013 },
|
||||
{ "asm-optimization", 1, nullptr, 4020 },
|
||||
|
@ -250,6 +253,8 @@ static struct option const pool_options[] = {
|
|||
{ "keepalive", 0, nullptr ,'k' },
|
||||
{ "nicehash", 0, nullptr, 1006 },
|
||||
{ "use-tls", 0, nullptr, 1015 },
|
||||
{ "pow-variant", 1, nullptr, 1017 },
|
||||
{ "variant", 1, nullptr, 1017 },
|
||||
{ nullptr, 0, nullptr, 0 }
|
||||
};
|
||||
|
||||
|
@ -318,7 +323,9 @@ constexpr static const char *pow_variant_names[] = {
|
|||
"msr",
|
||||
"xhv",
|
||||
"rto",
|
||||
"xfh"
|
||||
"xfh",
|
||||
"xtlv9",
|
||||
"upx"
|
||||
};
|
||||
|
||||
constexpr static const char *asm_optimization_names[] = {
|
||||
|
@ -366,6 +373,7 @@ Options::Options(int argc, char **argv) :
|
|||
m_ccPushOfflineMiners(false),
|
||||
m_ccPushPeriodicStatus(false),
|
||||
m_ccPushZeroHashrateMiners(false),
|
||||
m_forcePowVariant(false),
|
||||
m_fileName(Platform::defaultConfigName()),
|
||||
m_apiToken(nullptr),
|
||||
m_apiWorkerId(nullptr),
|
||||
|
@ -606,7 +614,6 @@ bool Options::parseArg(int key, const char *arg)
|
|||
case 1003: /* --donate-level */
|
||||
case 1004: /* --max-cpu-usage */
|
||||
case 1007: /* --print-time */
|
||||
case 1016: /* --force-pow-version */
|
||||
case 1021: /* --cpu-priority */
|
||||
case 4000: /* --api-port */
|
||||
case 4006: /* --cc-port */
|
||||
|
@ -628,7 +635,10 @@ bool Options::parseArg(int key, const char *arg)
|
|||
case 1015: /* --use-tls */
|
||||
return parseBoolean(key, true);
|
||||
|
||||
case 1017: /* --pow-variant */
|
||||
case 1016: /* --force-pow-variant */
|
||||
return parseBoolean(key, false);
|
||||
|
||||
case 1017: /* --pow-variant/--variant */
|
||||
return parsePowVariant(arg);
|
||||
|
||||
case 4016: /* --cc-use-tls */
|
||||
|
@ -803,16 +813,6 @@ bool Options::parseArg(int key, uint64_t arg)
|
|||
m_printTime = (int) arg;
|
||||
break;
|
||||
|
||||
case 1016: /* --force-pow-version */
|
||||
showDeprecateWarning("force-pow-version", "pow-variant");
|
||||
if (arg != POW_AUTODETECT && arg != POW_V0 && arg != POW_V1) {
|
||||
showUsage(1);
|
||||
return false;
|
||||
}
|
||||
|
||||
m_powVariant = static_cast<PowVariant>(arg);
|
||||
break;
|
||||
|
||||
case 1020: /* --cpu-affinity */
|
||||
if (arg) {
|
||||
m_affinity = arg;
|
||||
|
@ -901,6 +901,10 @@ bool Options::parseBoolean(int key, bool enable)
|
|||
m_pools.back()->setUseTls(enable);
|
||||
break;
|
||||
|
||||
case 1016: /* --force-pow-variant */
|
||||
m_forcePowVariant = enable;
|
||||
break;
|
||||
|
||||
case 2000: /* --colors */
|
||||
m_colors = enable;
|
||||
break;
|
||||
|
@ -1151,11 +1155,21 @@ bool Options::parsePowVariant(const char *powVariant)
|
|||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "freehaven") || !strcmp(powVariant, "faven"))) {
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "freehaven") || !strcmp(powVariant, "faven") || !strcmp(powVariant, "swap"))) {
|
||||
m_powVariant = POW_XFH;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellitev9")) {
|
||||
m_powVariant = POW_XTL_V9;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "uplexa")) {
|
||||
m_powVariant = POW_UPX;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1) {
|
||||
showUsage(1);
|
||||
return false;
|
||||
|
|
|
@ -80,6 +80,7 @@ public:
|
|||
inline bool ccPushZeroHashrateMiners() const { return m_ccPushZeroHashrateMiners; }
|
||||
inline bool ccUsePushover() const { return ccPushoverUser() && ccPushoverToken(); }
|
||||
inline bool ccUseTelegram() const { return ccTelegramBotToken() && ccTelegramChatId(); }
|
||||
inline bool forcePowVariant() const { return m_forcePowVariant; };
|
||||
inline const char *fileName() const { return m_fileName; }
|
||||
inline const char *apiToken() const { return m_apiToken; }
|
||||
inline const char *apiWorkerId() const { return m_apiWorkerId; }
|
||||
|
@ -165,6 +166,7 @@ private:
|
|||
bool m_ccPushOfflineMiners;
|
||||
bool m_ccPushPeriodicStatus;
|
||||
bool m_ccPushZeroHashrateMiners;
|
||||
bool m_forcePowVariant;
|
||||
const char* m_fileName;
|
||||
char *m_apiToken;
|
||||
char *m_apiWorkerId;
|
||||
|
|
|
@ -35,6 +35,8 @@ enum PowVariant
|
|||
POW_XHV,
|
||||
POW_RTO,
|
||||
POW_XFH,
|
||||
POW_XTL_V9,
|
||||
POW_UPX,
|
||||
LAST_ITEM
|
||||
};
|
||||
|
||||
|
@ -62,6 +64,10 @@ inline std::string getPowVariantName(PowVariant powVariant)
|
|||
return "rto";
|
||||
case POW_XFH:
|
||||
return "xfh";
|
||||
case POW_XTL_V9:
|
||||
return "xtlv9";
|
||||
case POW_UPX:
|
||||
return "upx";
|
||||
case POW_AUTODETECT:
|
||||
default:
|
||||
return "-1";
|
||||
|
@ -129,6 +135,10 @@ inline PowVariant parseVariant(const std::string variant)
|
|||
powVariant = PowVariant::POW_RTO;
|
||||
} else if (variant == "xfh" || variant == "freehaven" || variant == "faven") {
|
||||
powVariant = PowVariant::POW_XFH;
|
||||
} else if (variant == "xtlv9" || variant == "stellite_v9") {
|
||||
powVariant = PowVariant::POW_XTL_V9;
|
||||
} else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") {
|
||||
powVariant = PowVariant::POW_XTL_V9;
|
||||
}
|
||||
|
||||
return powVariant;
|
||||
|
|
|
@ -63,8 +63,8 @@ static void print_cpu()
|
|||
Cpu::brand(),
|
||||
Cpu::sockets(),
|
||||
Cpu::isX64() ? "\x1B[01;32m" : "\x1B[01;31m-",
|
||||
Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-",
|
||||
Options::i()->asmOptimization() != AsmOptimization::ASM_OFF ? "\x1B[01;32m" : "\x1B[01;31m",
|
||||
Cpu::hasAES() && Options::i()->aesni() ? "\x1B[01;32m" : "\x1B[01;31m-",
|
||||
Options::i()->asmOptimization() != AsmOptimization::ASM_OFF ? "\x1B[01;32m" : "\x1B[01;31m-",
|
||||
getAsmOptimizationName(Options::i()->asmOptimization()).c_str());
|
||||
# ifndef XMRIG_NO_LIBCPUID
|
||||
Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0);
|
||||
|
@ -121,12 +121,10 @@ static void print_threads()
|
|||
}
|
||||
|
||||
Log::i()->text(Options::i()->colors() ?
|
||||
"\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, %saes=%d\x1B[01;37m, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" :
|
||||
"\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" :
|
||||
" * THREADS: %d, %s, %saes=%d, hf=%zu, %sdonate=%d%%%s%s",
|
||||
Options::i()->threads(),
|
||||
Options::i()->algoName(),
|
||||
Options::i()->colors() && Options::i()->aesni() == 0 ? "\x1B[01;31m" : "",
|
||||
Options::i()->aesni(),
|
||||
Options::i()->hashFactor(),
|
||||
Options::i()->colors() && Options::i()->donateLevel() == 0 ? "\x1B[01;31m" : "",
|
||||
Options::i()->donateLevel(),
|
||||
|
|
|
@ -4,7 +4,7 @@
|
|||
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
||||
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
||||
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
||||
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto, xfh
|
||||
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx
|
||||
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
||||
"asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
||||
"background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead)
|
||||
|
@ -20,6 +20,7 @@
|
|||
"safe": false, // true to safe adjust threads and av settings for current CPU
|
||||
"syslog": false, // use system log for output messages
|
||||
"reboot-cmd" : "", // command to execute to reboot the OS
|
||||
"force-pow-variant" : false, // force pow variant, dont parse pow/variant from pool job
|
||||
"pools": [
|
||||
{
|
||||
"url": "donate2.graef.in:80", // URL of mining server
|
||||
|
|
|
@ -70,6 +70,18 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
|
|||
CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_XTL_V9) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
|
||||
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
|
||||
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_MSR) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
|
@ -111,6 +123,16 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
|
|||
} else {
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_XTL_V9) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_ALLOY) {
|
||||
CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
|
||||
|
@ -158,6 +180,16 @@ static void cryptonight_lite_aesni(AsmOptimization asmOptimization, PowVariant p
|
|||
#endif
|
||||
} else if (powVersion == PowVariant::POW_TUBE) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
|
||||
} else if (powVersion == PowVariant::POW_UPX) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
#else
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
|
||||
}
|
||||
|
@ -178,6 +210,16 @@ static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant
|
|||
#endif
|
||||
} else if (powVersion == PowVariant::POW_TUBE) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
|
||||
} else if (powVersion == PowVariant::POW_UPX) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
#else
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x20000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
|
||||
}
|
||||
|
@ -430,6 +472,10 @@ bool CryptoNight::selfTest(int algo)
|
|||
resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0;
|
||||
#endif
|
||||
|
||||
// cn-lite upx
|
||||
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads);
|
||||
resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0;
|
||||
} else {
|
||||
// cn v0 aka orignal
|
||||
|
||||
|
@ -525,6 +571,11 @@ bool CryptoNight::selfTest(int algo)
|
|||
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XFH, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xfh, 32) == 0;
|
||||
|
||||
// cnv8 + xtl aka cn-fast2
|
||||
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL_V9, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl_v9, 32) == 0;
|
||||
}
|
||||
|
||||
for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) {
|
||||
|
|
|
@ -122,6 +122,12 @@ const static uint8_t test_output_xfh[32] = {
|
|||
0x54, 0x71, 0x58, 0xDB, 0x94, 0x69, 0x8E, 0x3C, 0xA0, 0x3D, 0xE4, 0x81, 0x9A, 0x65, 0x9F, 0xEF
|
||||
};
|
||||
|
||||
// CN XTL V9
|
||||
const static uint8_t test_output_xtl_v9[32] = {
|
||||
0x5D, 0x4F, 0xBC, 0x35, 0x60, 0x97, 0xEA, 0x64, 0x40, 0xB0, 0x88, 0x8E, 0xDE, 0xB6, 0x35, 0xDD,
|
||||
0xC8, 0x4A, 0x0E, 0x39, 0x7C, 0x86, 0x84, 0x56, 0x89, 0x5C, 0x3F, 0x29, 0xBE, 0x73, 0x12, 0xA7
|
||||
};
|
||||
|
||||
// CN-LITE
|
||||
const static uint8_t test_output_v0_lite[160] = {
|
||||
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
|
||||
|
@ -151,7 +157,6 @@ const static uint8_t test_output_v1_lite[160] = {
|
|||
0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6
|
||||
};
|
||||
|
||||
|
||||
// CN-Lite IPBC
|
||||
const static uint8_t test_output_ipbc_lite[160] = {
|
||||
0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5,
|
||||
|
@ -167,6 +172,12 @@ const static uint8_t test_output_ipbc_lite[160] = {
|
|||
};
|
||||
|
||||
|
||||
// CN-Lite v7
|
||||
const static uint8_t test_output_upx[32] = {
|
||||
0xD1, 0x13, 0xE1, 0x1B, 0xBE, 0xD3, 0x2A, 0xC1, 0x7C, 0x2C, 0xAA, 0x55, 0xCC, 0x84, 0x2F, 0xA4,
|
||||
0x88, 0x91, 0xEE, 0x45, 0x63, 0x22, 0xA3, 0x0A, 0xB2, 0x80, 0xDF, 0x35, 0x16, 0x5C, 0xAF, 0x9A
|
||||
};
|
||||
|
||||
// CN-Heavy
|
||||
const static uint8_t test_output_heavy[160] = {
|
||||
0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64,
|
||||
|
|
|
@ -57,10 +57,17 @@ extern "C"
|
|||
void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cn_fastv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -768,7 +775,8 @@ public:
|
|||
uint64_t* h[NUM_HASH_BLOCKS];
|
||||
uint64_t al[NUM_HASH_BLOCKS];
|
||||
uint64_t ah[NUM_HASH_BLOCKS];
|
||||
uint64_t idx[NUM_HASH_BLOCKS];
|
||||
uint64_t idx[NUM_HASH_BLOCKS];CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(
|
||||
input, size, output, scratchPad);
|
||||
uint64_t sqrt_result[NUM_HASH_BLOCKS];
|
||||
__m128i bx0[NUM_HASH_BLOCKS];
|
||||
__m128i bx1[NUM_HASH_BLOCKS];
|
||||
|
@ -875,6 +883,15 @@ public:
|
|||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
@ -1433,6 +1450,8 @@ public:
|
|||
} else {
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else {
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else {
|
||||
if (ITERATIONS == 0x80000) {
|
||||
|
@ -1443,6 +1462,8 @@ public:
|
|||
} else {
|
||||
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else {
|
||||
cn_liteupx_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1559,6 +1580,40 @@ public:
|
|||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
||||
}
|
||||
|
||||
// single asm
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
const uint8_t* l = scratchPad[0]->memory;
|
||||
uint64_t* h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
||||
|
||||
keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
|
||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
|
||||
|
||||
#ifndef XMRIG_NO_ASM
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL) {
|
||||
if (SOFT_AES) {
|
||||
scratchPad[0]->input = input;
|
||||
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
||||
cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
||||
cn_fastv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
||||
cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
|
||||
keccakf(h, 24);
|
||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
@ -2278,6 +2333,38 @@ public:
|
|||
extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
|
||||
}
|
||||
|
||||
// double asm
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
|
||||
keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
|
||||
|
||||
const uint8_t* l0 = scratchPad[0]->memory;
|
||||
const uint8_t* l1 = scratchPad[1]->memory;
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
||||
uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
|
||||
|
||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
|
||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
||||
|
||||
#ifndef XMRIG_NO_ASM
|
||||
cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
#endif
|
||||
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
|
||||
|
||||
keccakf(h0, 24);
|
||||
keccakf(h1, 24);
|
||||
|
||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
||||
extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
@ -3225,6 +3312,15 @@ public:
|
|||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
@ -4480,6 +4576,15 @@ public:
|
|||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
@ -5405,6 +5510,15 @@ public:
|
|||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad,
|
||||
AsmOptimization asmOptimization)
|
||||
{
|
||||
// not supported
|
||||
}
|
||||
|
||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
|
|
414
src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc
Normal file
414
src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc
Normal file
|
@ -0,0 +1,414 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 262144
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
main_loop_double_fast2_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_fast2_sandybridge
|
||||
div_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_fast2_sandybridge
|
||||
div_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_fast2_sandybridge
|
||||
sqrt_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_fast2_sandybridge
|
||||
sqrt_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp
|
||||
|
||||
div_fix_1_fast2_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_fast2_sandybridge
|
||||
|
||||
div_fix_2_fast2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_1_fast2_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_2_fast2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_fast2_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_fast2_sandybridge_endp:
|
180
src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_fast2_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_fast2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_fast2_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_fast2_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_fast2_bulldozer_endp
|
||||
|
||||
sqrt_fixup_fast2_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_fast2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_fast2_bulldozer_endp:
|
186
src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc
Normal file
186
src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc
Normal file
|
@ -0,0 +1,186 @@
|
|||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_fast2_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_fast2_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_fast2_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_fast2_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_fast2_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_fast2_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_fast2_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ivybridge_endp:
|
183
src/crypto/asm/cn_fastv2_main_loop_ryzen.inc
Normal file
183
src/crypto/asm/cn_fastv2_main_loop_ryzen.inc
Normal file
|
@ -0,0 +1,183 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_fast2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_fast2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_fast2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_fast2_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_fast2_ryzen_endp
|
||||
|
||||
$sqrt_fixup_fast2_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_fast2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ryzen_endp:
|
271
src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc
Normal file
271
src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,271 @@
|
|||
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
stmxcsr DWORD PTR [rsp+4]
|
||||
mov DWORD PTR [rsp], 24448
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
mov ecx, -2147483647
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
or r9, rcx
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
shl rdx, 32
|
||||
mov eax, eax
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi]
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_fast2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+240]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
mov r10, QWORD PTR [rsp+224]
|
||||
movd r12d, xmm11
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp
|
||||
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:
|
74
src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc
Normal file
74
src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc
Normal file
|
@ -0,0 +1,74 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 131072
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 1048560
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_liteupx_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 1048560
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 1048560
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_liteupx_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
166
src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc
Normal file
166
src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,166 @@
|
|||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 1048560
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 131072
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 1048560
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
mul rcx
|
||||
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 1048560
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_liteupx_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -14,11 +14,18 @@
|
|||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
|
@ -105,6 +112,67 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
|||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_ivybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_ryzen_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_ryzen.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
#include "cn_fastv2_double_main_loop_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_liteupx_mainloop_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
|
@ -152,3 +220,27 @@ FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
|
|||
#include "cnv2_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
|
410
src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc
Normal file
410
src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc
Normal file
|
@ -0,0 +1,410 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 262144
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN 64
|
||||
main_loop_double_fast2_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_fast2_sandybridge
|
||||
div_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_fast2_sandybridge
|
||||
div_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_fast2_sandybridge
|
||||
sqrt_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_fast2_sandybridge
|
||||
sqrt_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp
|
||||
|
||||
div_fix_1_fast2_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_fast2_sandybridge
|
||||
|
||||
div_fix_2_fast2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_1_fast2_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_2_fast2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_fast2_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_fast2_sandybridge_endp:
|
180
src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_fast2_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movd r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movd xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_fast2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_fast2_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_fast2_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_fast2_bulldozer_endp
|
||||
|
||||
sqrt_fixup_fast2_bulldozer:
|
||||
movd r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_fast2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_fast2_bulldozer_endp:
|
182
src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc
Normal file
182
src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc
Normal file
|
@ -0,0 +1,182 @@
|
|||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_fast2_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_fast2_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_fast2_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_fast2_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_fast2_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_fast2_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_fast2_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ivybridge_endp:
|
179
src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc
Normal file
179
src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc
Normal file
|
@ -0,0 +1,179 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_fast2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_fast2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_fast2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_fast2_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_fast2_ryzen_endp
|
||||
|
||||
$sqrt_fixup_fast2_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_fast2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ryzen_endp:
|
267
src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc
Normal file
267
src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,267 @@
|
|||
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
stmxcsr DWORD PTR [rsp+4]
|
||||
mov DWORD PTR [rsp], 24448
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 262144
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
mov ecx, -2147483647
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
or r9, rcx
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
shl rdx, 32
|
||||
mov eax, eax
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi]
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_fast2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+240]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
mov r10, QWORD PTR [rsp+224]
|
||||
movd r12d, xmm11
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp
|
||||
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:
|
70
src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc
Normal file
70
src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc
Normal file
|
@ -0,0 +1,70 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 131072
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 1048560
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 1048560
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 1048560
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_litev1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
162
src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc
Normal file
162
src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,162 @@
|
|||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 1048560
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 131072
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 1048560
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
mul rcx
|
||||
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 1048560
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_litev1_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -6,11 +6,18 @@ PUBLIC cnv2_mainloop_ivybridge_asm
|
|||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cn_fast2_mainloop_ivybridge_asm
|
||||
PUBLIC cn_fast2_mainloop_ryzen_asm
|
||||
PUBLIC cn_fast2_mainloop_bulldozer_asm
|
||||
PUBLIC cn_fast2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cn_liteupx_mainloop_sandybridge_asm
|
||||
|
||||
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_fast2_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_sandybridge_asm PROC
|
||||
|
@ -54,6 +61,36 @@ cnv2_double_mainloop_sandybridge_asm PROC
|
|||
ret 0
|
||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn_fast2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
cn_fast2_mainloop_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn_fast2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
cn_fast2_mainloop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn_fast2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
cn_fast2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_fast2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast2_double_mainloop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_liteupx_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_liteupx_mainloop_sandybridge.inc
|
||||
ret 0
|
||||
cn_liteupx_mainloop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc
|
||||
|
@ -78,5 +115,17 @@ cnv2_mainloop_soft_aes_sandybridge_asm PROC
|
|||
ret 0
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast2_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_fast2_mainloop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
_TEXT_CN_MAINLOOP ENDS
|
||||
END
|
||||
|
||||
ALIGN 64
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_liteupx_mainloop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP
|
|
@ -10,11 +10,18 @@
|
|||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_fast2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_fast2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_fast2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv1_mainloop_sandybridge_asm):
|
||||
|
@ -51,6 +58,31 @@ FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
|||
#include "../cnv2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast2_mainloop_ivybridge_asm):
|
||||
#include "../cn_fast2_main_loop_ivybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast2_mainloop_ryzen_asm):
|
||||
#include "../cn_fast2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast2_mainloop_bulldozer_asm):
|
||||
#include "../cn_fast2_main_loop_bulldozer.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast2_double_mainloop_sandybridge_asm):
|
||||
#include "../cn_fast2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||
#include "../cn_liteupx_mainloop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_mainloop_soft_aes_sandybridge.inc"
|
||||
|
@ -70,3 +102,14 @@ ALIGN 64
|
|||
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_fast2_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
|
@ -4,7 +4,7 @@
|
|||
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
||||
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
||||
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
||||
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto, xfh
|
||||
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx
|
||||
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
||||
"asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
||||
"background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead)
|
||||
|
@ -20,6 +20,7 @@
|
|||
"safe": false, // true to safe adjust threads and av settings for current CPU
|
||||
"syslog": false, // use system log for output messages
|
||||
"reboot-cmd" : "", // command to execute to reboot the OS
|
||||
"force-pow-variant" : false, // force pow variant, dont parse pow/variant from pool job
|
||||
"pools": [
|
||||
{
|
||||
"url": "donate2.graef.in:80", // URL of mining server
|
||||
|
|
|
@ -231,16 +231,17 @@ bool Client::parseJob(const rapidjson::Value ¶ms, int *code)
|
|||
|
||||
PowVariant powVariant = Options::i()->powVariant();
|
||||
|
||||
if (!Options::i()->forcePowVariant()) {
|
||||
if (params.HasMember("algo")) {
|
||||
std::string algo = params["algo"].GetString();
|
||||
|
||||
if (algo.find("/") != std::string::npos) {
|
||||
powVariant = parseVariant(algo.substr(algo.find("/")+1));
|
||||
powVariant = parseVariant(algo.substr(algo.find("/") + 1));
|
||||
}
|
||||
}
|
||||
|
||||
if (params.HasMember("variant")) {
|
||||
const rapidjson::Value &variant = params["variant"];
|
||||
const rapidjson::Value& variant = params["variant"];
|
||||
|
||||
PowVariant parsedVariant = powVariant;
|
||||
|
||||
|
@ -254,6 +255,7 @@ bool Client::parseJob(const rapidjson::Value ¶ms, int *code)
|
|||
powVariant = parsedVariant;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
job.setPowVariant(powVariant);
|
||||
|
||||
|
|
|
@ -146,6 +146,10 @@ PowVariant Job::powVariant() const
|
|||
} else {
|
||||
return PowVariant::POW_V0;
|
||||
}
|
||||
} else if (m_powVariant == PowVariant::POW_XTL) {
|
||||
if (m_blob[0] > 5) {
|
||||
return PowVariant::POW_XTL_V9;
|
||||
}
|
||||
} else {
|
||||
return m_powVariant;
|
||||
}
|
||||
|
|
|
@ -36,14 +36,14 @@
|
|||
#define APP_DESC "XMRigCC CPU miner"
|
||||
#define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id"
|
||||
#endif
|
||||
#define APP_VERSION "1.8.7 (based on XMRig)"
|
||||
#define APP_VERSION "1.8.8 (based on XMRig)"
|
||||
#define APP_DOMAIN ""
|
||||
#define APP_SITE "https://github.com/Bendr0id/xmrigCC"
|
||||
#define APP_KIND "cpu"
|
||||
|
||||
#define APP_VER_MAJOR 1
|
||||
#define APP_VER_MINOR 8
|
||||
#define APP_VER_BUILD 7
|
||||
#define APP_VER_BUILD 8
|
||||
#define APP_VER_REV 0
|
||||
|
||||
#ifndef NDEBUG
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue