Integrated cryptonight-ultralite + optimizations (#226)
* Integrated cryptonight-ultralite + optimizations
This commit is contained in:
parent
b101db98aa
commit
4c995ea443
29 changed files with 2788 additions and 137 deletions
|
@ -1,3 +1,5 @@
|
||||||
|
# 1.8.9
|
||||||
|
- Added cn-ultralite algo used by upcoming TurtleV2 fork (algo: "cryptonight-ultralite", variant "auto")
|
||||||
# 1.8.8
|
# 1.8.8
|
||||||
- Added XTL v5/9 with autodetect (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))
|
- Added XTL v5/9 with autodetect (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))
|
||||||
- Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant "upx")
|
- Added cn-lite variant UPX/uPlexa (algo: "cryptonight-lite", variant "upx")
|
||||||
|
|
|
@ -27,6 +27,7 @@ Full Windows/Linux compatible, and you can mix Linux and Windows miner on one XM
|
||||||
## Additional features of XMRigCC (on top of XMRig)
|
## Additional features of XMRigCC (on top of XMRig)
|
||||||
|
|
||||||
Check the [Coin Configuration](https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations) guide
|
Check the [Coin Configuration](https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations) guide
|
||||||
|
* **NEW Support of Cryptonight-Ultralite TRTL/Turtle variant (algo: "cryptonight-ultralite", variant "auto")**
|
||||||
* **NEW Support of Cryptonight-Lite UPX/uPlexa variant (algo: "cryptonight-lite", variant "upx")**
|
* **NEW Support of Cryptonight-Lite UPX/uPlexa variant (algo: "cryptonight-lite", variant "upx")**
|
||||||
* **NEW Support of Cryptonight XTL v5/v9 PoW changes aka CN-FastV2 (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))**
|
* **NEW Support of Cryptonight XTL v5/v9 PoW changes aka CN-FastV2 (algo: "cryptonight", variant: "xtl" (autodetect), "xtlv9" (force v9))**
|
||||||
* **Support of Cryptonight XFH/SWAP variant aka CN-Heavy-Fast**
|
* **Support of Cryptonight XFH/SWAP variant aka CN-Heavy-Fast**
|
||||||
|
|
17
src/Cpu.cpp
17
src/Cpu.cpp
|
@ -67,6 +67,12 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
|
||||||
size_t cache = availableCache();
|
size_t cache = availableCache();
|
||||||
size_t algoBlockSize;
|
size_t algoBlockSize;
|
||||||
switch (algo) {
|
switch (algo) {
|
||||||
|
case Options::ALGO_CRYPTONIGHT_ULTRALITE:
|
||||||
|
algoBlockSize = 256;
|
||||||
|
break;
|
||||||
|
case Options::ALGO_CRYPTONIGHT_SUPERLITE:
|
||||||
|
algoBlockSize = 512;
|
||||||
|
break;
|
||||||
case Options::ALGO_CRYPTONIGHT_LITE:
|
case Options::ALGO_CRYPTONIGHT_LITE:
|
||||||
algoBlockSize = 1024;
|
algoBlockSize = 1024;
|
||||||
break;
|
break;
|
||||||
|
@ -81,8 +87,17 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
|
||||||
|
|
||||||
size_t maximumReasonableFactor = std::max(cache / algoBlockSize, static_cast<size_t>(1ul));
|
size_t maximumReasonableFactor = std::max(cache / algoBlockSize, static_cast<size_t>(1ul));
|
||||||
size_t maximumReasonableThreadCount = std::min(maximumReasonableFactor, m_totalThreads);
|
size_t maximumReasonableThreadCount = std::min(maximumReasonableFactor, m_totalThreads);
|
||||||
size_t maximumReasonableHashFactor = std::min(maximumReasonableFactor, (algo == Options::ALGO_CRYPTONIGHT_HEAVY || powVariant == POW_XFH) ? 3 : static_cast<size_t>(MAX_NUM_HASH_BLOCKS));
|
size_t maximumReasonableHashFactor = static_cast<size_t>(MAX_NUM_HASH_BLOCKS);
|
||||||
|
|
||||||
|
if (algo == Options::ALGO_CRYPTONIGHT_HEAVY || powVariant == POW_XFH) {
|
||||||
|
maximumReasonableHashFactor = 3;
|
||||||
|
} else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
|
||||||
|
if (m_asmOptimization == ASM_INTEL) {
|
||||||
|
maximumReasonableHashFactor = 2;
|
||||||
|
} else {
|
||||||
|
maximumReasonableHashFactor = 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
if (safeMode) {
|
if (safeMode) {
|
||||||
if (threadsCount > maximumReasonableThreadCount) {
|
if (threadsCount > maximumReasonableThreadCount) {
|
||||||
threadsCount = maximumReasonableThreadCount;
|
threadsCount = maximumReasonableThreadCount;
|
||||||
|
|
|
@ -40,6 +40,12 @@ ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId)
|
||||||
size_t scratchPadSize;
|
size_t scratchPadSize;
|
||||||
|
|
||||||
switch (m_algo) {
|
switch (m_algo) {
|
||||||
|
case Options::ALGO_CRYPTONIGHT_ULTRALITE:
|
||||||
|
scratchPadSize = MEMORY_ULTRA_LITE;
|
||||||
|
break;
|
||||||
|
case Options::ALGO_CRYPTONIGHT_SUPERLITE:
|
||||||
|
scratchPadSize = MEMORY_SUPER_LITE;
|
||||||
|
break;
|
||||||
case Options::ALGO_CRYPTONIGHT_LITE:
|
case Options::ALGO_CRYPTONIGHT_LITE:
|
||||||
scratchPadSize = MEMORY_LITE;
|
scratchPadSize = MEMORY_LITE;
|
||||||
break;
|
break;
|
||||||
|
|
|
@ -63,7 +63,7 @@ Usage: " APP_ID " [OPTIONS]\n\
|
||||||
Options:\n"
|
Options:\n"
|
||||||
# ifndef XMRIG_CC_SERVER
|
# ifndef XMRIG_CC_SERVER
|
||||||
"\
|
"\
|
||||||
-a, --algo=ALGO cryptonight (default), cryptonight-lite or cryptonight-heavy\n\
|
-a, --algo=ALGO cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy\n\
|
||||||
-o, --url=URL URL of mining server\n\
|
-o, --url=URL URL of mining server\n\
|
||||||
-O, --userpass=U:P username:password pair for mining server\n\
|
-O, --userpass=U:P username:password pair for mining server\n\
|
||||||
-u, --user=USERNAME username for mining server\n\
|
-u, --user=USERNAME username for mining server\n\
|
||||||
|
@ -73,7 +73,7 @@ Options:\n"
|
||||||
-k, --keepalive send keepalived for prevent timeout (need pool support)\n\
|
-k, --keepalive send keepalived for prevent timeout (need pool support)\n\
|
||||||
-r, --retries=N number of times to retry before switch to backup server (default: 5)\n\
|
-r, --retries=N number of times to retry before switch to backup server (default: 5)\n\
|
||||||
-R, --retry-pause=N time to pause between retries (default: 5)\n\
|
-R, --retry-pause=N time to pause between retries (default: 5)\n\
|
||||||
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx'\n\
|
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for > v5), 'rto', 'xfh', 'upx', 'turtle'\n\
|
||||||
for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
|
for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
|
||||||
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
|
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
|
||||||
--multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
|
--multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
|
||||||
|
@ -303,12 +303,16 @@ static struct option const cc_server_options[] = {
|
||||||
static const char *algo_names[] = {
|
static const char *algo_names[] = {
|
||||||
"cryptonight",
|
"cryptonight",
|
||||||
"cryptonight-lite",
|
"cryptonight-lite",
|
||||||
|
"cryptonight-superlite",
|
||||||
|
"cryptonight-ultralite",
|
||||||
"cryptonight-heavy"
|
"cryptonight-heavy"
|
||||||
};
|
};
|
||||||
|
|
||||||
static const char *algo_short_names[] = {
|
static const char *algo_short_names[] = {
|
||||||
"cn",
|
"cn",
|
||||||
"cn-lite",
|
"cn-lite",
|
||||||
|
"cn-superlite",
|
||||||
|
"cn-ultralite",
|
||||||
"cn-heavy"
|
"cn-heavy"
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -325,7 +329,8 @@ constexpr static const char *pow_variant_names[] = {
|
||||||
"rto",
|
"rto",
|
||||||
"xfh",
|
"xfh",
|
||||||
"xtlv9",
|
"xtlv9",
|
||||||
"upx"
|
"upx",
|
||||||
|
"turtle"
|
||||||
};
|
};
|
||||||
|
|
||||||
constexpr static const char *asm_optimization_names[] = {
|
constexpr static const char *asm_optimization_names[] = {
|
||||||
|
@ -1086,6 +1091,17 @@ bool Options::setAlgo(const char *algo)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-super-lite") || !strcmp(algo, "cryptonight-super-lite") || !strcmp(algo, "cryptonight-superlight"))) {
|
||||||
|
m_algo = ALGO_CRYPTONIGHT_SUPERLITE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-ultra-lite") || !strcmp(algo, "cryptonight-ultra-lite") || !strcmp(algo, "cryptonight-ultralight"))) {
|
||||||
|
m_algo = ALGO_CRYPTONIGHT_ULTRALITE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) {
|
if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) {
|
||||||
showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")");
|
showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")");
|
||||||
m_algo = ALGO_CRYPTONIGHT_LITE;
|
m_algo = ALGO_CRYPTONIGHT_LITE;
|
||||||
|
@ -1160,7 +1176,7 @@ bool Options::parsePowVariant(const char *powVariant)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellitev9")) {
|
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2"))) {
|
||||||
m_powVariant = POW_XTL_V9;
|
m_powVariant = POW_XTL_V9;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
@ -1170,6 +1186,11 @@ bool Options::parsePowVariant(const char *powVariant)
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "trtl")) {
|
||||||
|
m_powVariant = POW_TURTLE;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
if (i == ARRAY_SIZE(pow_variant_names) - 1) {
|
if (i == ARRAY_SIZE(pow_variant_names) - 1) {
|
||||||
showUsage(1);
|
showUsage(1);
|
||||||
return false;
|
return false;
|
||||||
|
|
|
@ -46,7 +46,9 @@ public:
|
||||||
enum Algo {
|
enum Algo {
|
||||||
ALGO_CRYPTONIGHT, /* CryptoNight (2MB ScratchPad) */
|
ALGO_CRYPTONIGHT, /* CryptoNight (2MB ScratchPad) */
|
||||||
ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (1MB ScratchPad) */
|
ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (1MB ScratchPad) */
|
||||||
ALGO_CRYPTONIGHT_HEAVY /* CryptoNight-Heavy (4MB ScratchPad) */
|
ALGO_CRYPTONIGHT_SUPERLITE, /* CryptoNight-Superlite (512KB ScratchPad) */
|
||||||
|
ALGO_CRYPTONIGHT_ULTRALITE, /* CryptoNight-Ultralite (256KB ScratchPad) */
|
||||||
|
ALGO_CRYPTONIGHT_HEAVY, /* CryptoNight-Heavy (4MB ScratchPad) */
|
||||||
};
|
};
|
||||||
|
|
||||||
enum AlgoVariant {
|
enum AlgoVariant {
|
||||||
|
|
|
@ -37,6 +37,7 @@ enum PowVariant
|
||||||
POW_XFH,
|
POW_XFH,
|
||||||
POW_XTL_V9,
|
POW_XTL_V9,
|
||||||
POW_UPX,
|
POW_UPX,
|
||||||
|
POW_TURTLE,
|
||||||
LAST_ITEM
|
LAST_ITEM
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -68,6 +69,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
|
||||||
return "xtlv9";
|
return "xtlv9";
|
||||||
case POW_UPX:
|
case POW_UPX:
|
||||||
return "upx";
|
return "upx";
|
||||||
|
case POW_TURTLE:
|
||||||
|
return "turtle";
|
||||||
case POW_AUTODETECT:
|
case POW_AUTODETECT:
|
||||||
default:
|
default:
|
||||||
return "-1";
|
return "-1";
|
||||||
|
@ -135,10 +138,12 @@ inline PowVariant parseVariant(const std::string variant)
|
||||||
powVariant = PowVariant::POW_RTO;
|
powVariant = PowVariant::POW_RTO;
|
||||||
} else if (variant == "xfh" || variant == "freehaven" || variant == "faven") {
|
} else if (variant == "xfh" || variant == "freehaven" || variant == "faven") {
|
||||||
powVariant = PowVariant::POW_XFH;
|
powVariant = PowVariant::POW_XFH;
|
||||||
} else if (variant == "xtlv9" || variant == "stellite_v9") {
|
} else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2") {
|
||||||
powVariant = PowVariant::POW_XTL_V9;
|
powVariant = PowVariant::POW_XTL_V9;
|
||||||
} else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") {
|
} else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") {
|
||||||
powVariant = PowVariant::POW_UPX;
|
powVariant = PowVariant::POW_UPX;
|
||||||
|
} else if (variant == "turtle" || variant == "trtl") {
|
||||||
|
powVariant = PowVariant::POW_TURTLE;
|
||||||
}
|
}
|
||||||
|
|
||||||
return powVariant;
|
return powVariant;
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
{
|
{
|
||||||
"algo": "cryptonight", // cryptonight (default), cryptonight-lite or cryptonight-heavy
|
"algo": "cryptonight", // cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy
|
||||||
"aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off)
|
"aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off)
|
||||||
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
||||||
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
||||||
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
||||||
"pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx
|
"pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx, turtle
|
||||||
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
||||||
"asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
"asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
||||||
"background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead)
|
"background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead)
|
||||||
|
|
|
@ -77,7 +77,7 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
|
||||||
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
|
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
|
||||||
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
|
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
|
||||||
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
|
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
|
||||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization);
|
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
|
||||||
} else {
|
} else {
|
||||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
}
|
}
|
||||||
|
@ -129,7 +129,7 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
|
||||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
#else
|
#else
|
||||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowFastV2_asm(input, size, output, scratchPad, asmOptimization);
|
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
|
||||||
} else {
|
} else {
|
||||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
}
|
}
|
||||||
|
@ -225,6 +225,44 @@ static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <size_t NUM_HASH_BLOCKS>
|
||||||
|
static void cryptonight_super_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t NUM_HASH_BLOCKS>
|
||||||
|
static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t NUM_HASH_BLOCKS>
|
||||||
|
static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||||
|
#if defined(XMRIG_ARM)
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
|
#else
|
||||||
|
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
|
||||||
|
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
|
||||||
|
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
|
||||||
|
} else {
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
template <size_t NUM_HASH_BLOCKS>
|
||||||
|
static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||||
|
#if defined(XMRIG_ARM)
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
|
#else
|
||||||
|
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
|
||||||
|
} else {
|
||||||
|
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
template <size_t NUM_HASH_BLOCKS>
|
template <size_t NUM_HASH_BLOCKS>
|
||||||
static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||||
# if !defined(XMRIG_ARMv7)
|
# if !defined(XMRIG_ARMv7)
|
||||||
|
@ -275,6 +313,22 @@ void setCryptoNightHashMethods(Options::Algo algo, bool aesni)
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
case Options::ALGO_CRYPTONIGHT_SUPERLITE:
|
||||||
|
if (aesni) {
|
||||||
|
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_super_lite_aesni<HASH_FACTOR>;
|
||||||
|
} else {
|
||||||
|
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_super_lite_softaes<HASH_FACTOR>;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
|
case Options::ALGO_CRYPTONIGHT_ULTRALITE:
|
||||||
|
if (aesni) {
|
||||||
|
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_ultra_lite_aesni<HASH_FACTOR>;
|
||||||
|
} else {
|
||||||
|
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_ultra_lite_softaes<HASH_FACTOR>;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
|
||||||
case Options::ALGO_CRYPTONIGHT_HEAVY:
|
case Options::ALGO_CRYPTONIGHT_HEAVY:
|
||||||
if (aesni) {
|
if (aesni) {
|
||||||
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_aesni<HASH_FACTOR>;
|
cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_aesni<HASH_FACTOR>;
|
||||||
|
@ -328,7 +382,8 @@ bool CryptoNight::selfTest(int algo)
|
||||||
#if MAX_NUM_HASH_BLOCKS > 4
|
#if MAX_NUM_HASH_BLOCKS > 4
|
||||||
|| cryptonight_hash_ctx[4] == nullptr
|
|| cryptonight_hash_ctx[4] == nullptr
|
||||||
#endif
|
#endif
|
||||||
) {
|
)
|
||||||
|
{
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -345,6 +400,8 @@ bool CryptoNight::selfTest(int algo)
|
||||||
|
|
||||||
bool result = true;
|
bool result = true;
|
||||||
bool resultLite = true;
|
bool resultLite = true;
|
||||||
|
bool resultSuperLite = true;
|
||||||
|
bool resultUltraLite = true;
|
||||||
bool resultHeavy = true;
|
bool resultHeavy = true;
|
||||||
|
|
||||||
AsmOptimization asmOptimization = Options::i()->asmOptimization();
|
AsmOptimization asmOptimization = Options::i()->asmOptimization();
|
||||||
|
@ -476,6 +533,17 @@ bool CryptoNight::selfTest(int algo)
|
||||||
|
|
||||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads);
|
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_UPX, test_input, 76, output, scratchPads);
|
||||||
resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0;
|
resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0;
|
||||||
|
|
||||||
|
} else if (algo == Options::ALGO_CRYPTONIGHT_SUPERLITE) {
|
||||||
|
|
||||||
|
return false;
|
||||||
|
|
||||||
|
} else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
|
||||||
|
// cn ultralite (cnv8 + turtle)
|
||||||
|
|
||||||
|
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads);
|
||||||
|
resultUltraLite = resultUltraLite && memcmp(output, test_output_turtle, 32) == 0;
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
// cn v0 aka orignal
|
// cn v0 aka orignal
|
||||||
|
|
||||||
|
@ -583,5 +651,5 @@ bool CryptoNight::selfTest(int algo)
|
||||||
_mm_free(scratchPads[i]);
|
_mm_free(scratchPads[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
return result && resultLite & resultHeavy;
|
return result && resultLite && resultSuperLite && resultUltraLite && resultHeavy;
|
||||||
}
|
}
|
|
@ -33,6 +33,8 @@
|
||||||
|
|
||||||
#define MEMORY 2097152 /* 2 MiB */
|
#define MEMORY 2097152 /* 2 MiB */
|
||||||
#define MEMORY_LITE 1048576 /* 1 MiB */
|
#define MEMORY_LITE 1048576 /* 1 MiB */
|
||||||
|
#define MEMORY_SUPER_LITE 524288 /* 512 KiB */
|
||||||
|
#define MEMORY_ULTRA_LITE 262144 /* 256 KiB */
|
||||||
#define MEMORY_HEAVY 4194304 /* 4 MiB */
|
#define MEMORY_HEAVY 4194304 /* 4 MiB */
|
||||||
|
|
||||||
#define POW_DEFAULT_INDEX_SHIFT 3
|
#define POW_DEFAULT_INDEX_SHIFT 3
|
||||||
|
|
|
@ -212,4 +212,10 @@ const static uint8_t test_output_heavy_tube[96] = {
|
||||||
0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb
|
0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// CN-Ultralite/Turtle
|
||||||
|
const static uint8_t test_output_turtle[32] = {
|
||||||
|
0x08, 0xF4, 0x21, 0xD7, 0x83, 0x31, 0x17, 0x30, 0x0E, 0xDA, 0x66, 0xE9, 0x8F, 0x4A, 0x25, 0x69,
|
||||||
|
0x09, 0x3D, 0xF3, 0x00, 0x50, 0x01, 0x73, 0x94, 0x4E, 0xFC, 0x40, 0x1E, 0x9A, 0x4A, 0x17, 0xAF
|
||||||
|
};
|
||||||
|
|
||||||
#endif /* __CRYPTONIGHT_TEST_H__ */
|
#endif /* __CRYPTONIGHT_TEST_H__ */
|
||||||
|
|
|
@ -62,12 +62,17 @@ extern "C"
|
||||||
void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||||
void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||||
void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||||
|
void cn_ultralitev2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||||
|
void cn_ultralitev2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||||
|
void cn_ultralitev2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||||
|
void cn_ultralitev2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||||
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||||
|
void cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -775,8 +780,7 @@ public:
|
||||||
uint64_t* h[NUM_HASH_BLOCKS];
|
uint64_t* h[NUM_HASH_BLOCKS];
|
||||||
uint64_t al[NUM_HASH_BLOCKS];
|
uint64_t al[NUM_HASH_BLOCKS];
|
||||||
uint64_t ah[NUM_HASH_BLOCKS];
|
uint64_t ah[NUM_HASH_BLOCKS];
|
||||||
uint64_t idx[NUM_HASH_BLOCKS];CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(
|
uint64_t idx[NUM_HASH_BLOCKS];
|
||||||
input, size, output, scratchPad);
|
|
||||||
uint64_t sqrt_result[NUM_HASH_BLOCKS];
|
uint64_t sqrt_result[NUM_HASH_BLOCKS];
|
||||||
__m128i bx0[NUM_HASH_BLOCKS];
|
__m128i bx0[NUM_HASH_BLOCKS];
|
||||||
__m128i bx1[NUM_HASH_BLOCKS];
|
__m128i bx1[NUM_HASH_BLOCKS];
|
||||||
|
@ -883,15 +887,6 @@ public:
|
||||||
// not supported
|
// not supported
|
||||||
}
|
}
|
||||||
|
|
||||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
// not supported
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||||
size_t size,
|
size_t size,
|
||||||
uint8_t* __restrict__ output,
|
uint8_t* __restrict__ output,
|
||||||
|
@ -1564,48 +1559,38 @@ public:
|
||||||
if (SOFT_AES) {
|
if (SOFT_AES) {
|
||||||
scratchPad[0]->input = input;
|
scratchPad[0]->input = input;
|
||||||
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
||||||
|
if (ITERATIONS == 0x40000) {
|
||||||
|
cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||||
|
} else if (ITERATIONS == 0x10000) {
|
||||||
|
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||||
|
} else {
|
||||||
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (ITERATIONS == 0x10000) {
|
||||||
|
cn_ultralitev2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||||
|
} else if (ITERATIONS == 0x40000) {
|
||||||
|
cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||||
} else {
|
} else {
|
||||||
cnv2_mainloop_ivybridge_asm(scratchPad[0]);
|
cnv2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
||||||
|
if (ITERATIONS == 0x10000) {
|
||||||
|
cn_ultralitev2_mainloop_ryzen_asm(scratchPad[0]);
|
||||||
|
} else if (ITERATIONS == 0x40000) {
|
||||||
|
cn_fastv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||||
|
} else {
|
||||||
cnv2_mainloop_ryzen_asm(scratchPad[0]);
|
cnv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||||
|
}
|
||||||
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
||||||
|
if (ITERATIONS == 0x10000) {
|
||||||
|
cn_ultralitev2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||||
|
} else if (ITERATIONS == 0x40000) {
|
||||||
|
cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||||
|
} else {
|
||||||
cnv2_mainloop_bulldozer_asm(scratchPad[0]);
|
cnv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||||
}
|
}
|
||||||
#endif
|
|
||||||
|
|
||||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
|
|
||||||
keccakf(h, 24);
|
|
||||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
|
||||||
}
|
|
||||||
|
|
||||||
// single asm
|
|
||||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
const uint8_t* l = scratchPad[0]->memory;
|
|
||||||
uint64_t* h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
|
||||||
|
|
||||||
keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
|
|
||||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
|
|
||||||
|
|
||||||
#ifndef XMRIG_NO_ASM
|
|
||||||
if (asmOptimization == AsmOptimization::ASM_INTEL) {
|
|
||||||
if (SOFT_AES) {
|
|
||||||
scratchPad[0]->input = input;
|
|
||||||
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
|
||||||
cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
|
||||||
} else {
|
|
||||||
cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]);
|
|
||||||
}
|
|
||||||
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
|
||||||
cn_fastv2_mainloop_ryzen_asm(scratchPad[0]);
|
|
||||||
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
|
||||||
cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]);
|
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -1614,6 +1599,7 @@ public:
|
||||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||||
size_t size,
|
size_t size,
|
||||||
uint8_t* __restrict__ output,
|
uint8_t* __restrict__ output,
|
||||||
|
@ -2320,39 +2306,13 @@ public:
|
||||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
||||||
|
|
||||||
#ifndef XMRIG_NO_ASM
|
#ifndef XMRIG_NO_ASM
|
||||||
cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
if (ITERATIONS == 0x10000) {
|
||||||
#endif
|
cn_ultralitev2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||||
|
} else if (ITERATIONS == 0x40000) {
|
||||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
|
||||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
|
|
||||||
|
|
||||||
keccakf(h0, 24);
|
|
||||||
keccakf(h1, 24);
|
|
||||||
|
|
||||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
|
||||||
extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
|
|
||||||
}
|
|
||||||
|
|
||||||
// double asm
|
|
||||||
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
|
|
||||||
keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
|
|
||||||
|
|
||||||
const uint8_t* l0 = scratchPad[0]->memory;
|
|
||||||
const uint8_t* l1 = scratchPad[1]->memory;
|
|
||||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
|
||||||
uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
|
|
||||||
|
|
||||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
|
|
||||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
|
||||||
|
|
||||||
#ifndef XMRIG_NO_ASM
|
|
||||||
cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||||
|
} else {
|
||||||
|
cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||||
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
||||||
|
@ -3312,15 +3272,6 @@ public:
|
||||||
// not supported
|
// not supported
|
||||||
}
|
}
|
||||||
|
|
||||||
// Placeholder hashPowFastV2_asm that the original author marked as
// "not supported". The body contains no statements: none of the parameters
// is read and nothing is written to `output`.
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
// Not supported: intentionally left empty.
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||||
size_t size,
|
size_t size,
|
||||||
uint8_t* __restrict__ output,
|
uint8_t* __restrict__ output,
|
||||||
|
@ -4576,15 +4527,6 @@ public:
|
||||||
// not supported
|
// not supported
|
||||||
}
|
}
|
||||||
|
|
||||||
// Placeholder hashPowFastV2_asm that the original author marked as
// "not supported". The body contains no statements: none of the parameters
// is read and nothing is written to `output`.
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
// Not supported: intentionally left empty.
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||||
size_t size,
|
size_t size,
|
||||||
uint8_t* __restrict__ output,
|
uint8_t* __restrict__ output,
|
||||||
|
@ -5510,15 +5452,6 @@ public:
|
||||||
// not supported
|
// not supported
|
||||||
}
|
}
|
||||||
|
|
||||||
// Placeholder hashPowFastV2_asm that the original author marked as
// "not supported". The body contains no statements: none of the parameters
// is read and nothing is written to `output`.
inline static void hashPowFastV2_asm(const uint8_t* __restrict__ input,
|
|
||||||
size_t size,
|
|
||||||
uint8_t* __restrict__ output,
|
|
||||||
ScratchPad** __restrict__ scratchPad,
|
|
||||||
AsmOptimization asmOptimization)
|
|
||||||
{
|
|
||||||
// Not supported: intentionally left empty.
|
|
||||||
}
|
|
||||||
|
|
||||||
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
inline static void hashLiteTube(const uint8_t* __restrict__ input,
|
||||||
size_t size,
|
size_t size,
|
||||||
uint8_t* __restrict__ output,
|
uint8_t* __restrict__ output,
|
||||||
|
|
|
@ -19,6 +19,10 @@
|
||||||
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
||||||
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
|
||||||
|
|
||||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
@ -26,6 +30,7 @@
|
||||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
|
@ -173,6 +178,55 @@ FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/*
 * Exported entry point for the single-hash cn-ultralite v2 main loop
 * (Ivy Bridge flavour, per the name). It is a thin wrapper around the loop
 * body in cn_ultralitev2_main_loop_ivybridge.inc.
 */
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm):
|
||||||
|
/* Reserve 48 bytes: the included body saves rbx at [rsp+24], i.e. above
   its own entry stack pointer, so that area must belong to this frame. */
sub rsp, 48
|
||||||
|
/* The included body reads its context pointer from rcx; copy it from rdi
   (first integer argument under the System V AMD64 calling convention). */
mov rcx, rdi
|
||||||
|
/* The body ends with a jump to its own trailing label, so execution
   continues with the stack release below. */
#include "cn_ultralitev2_main_loop_ivybridge.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/*
 * Exported entry point for the dual-hash cn-ultralite v2 main loop
 * (Sandy Bridge flavour, per the name). It is a thin wrapper around the loop
 * body in cn_ultralitev2_double_main_loop_sandybridge.inc.
 */
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm):
|
||||||
|
/* Reserve 48 bytes: the included body keeps its saved MXCSR and two state
   qwords 8..31 bytes above its own entry stack pointer. */
sub rsp, 48
|
||||||
|
/* The included body reads the two context pointers from rcx and rdx; copy
   them from rdi and rsi (first two integer arguments under the System V
   AMD64 calling convention). */
mov rcx, rdi
|
||||||
|
mov rdx, rsi
|
||||||
|
/* The body ends with a jump to its own trailing label, so execution
   continues with the stack release below. */
#include "cn_ultralitev2_double_main_loop_sandybridge.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/*
 * Exported entry point for the single-hash cn-ultralite v2 main loop
 * (Ryzen flavour, per the name). It is a thin wrapper around the loop body
 * in cn_ultralitev2_main_loop_ryzen.inc.
 */
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm):
|
||||||
|
/* Reserve 48 bytes: the included body starts by saving rbx, rbp and rsi at
   [rsp+16], [rsp+24] and [rsp+32], i.e. above its own entry stack pointer. */
sub rsp, 48
|
||||||
|
/* The included body reads its context pointer from rcx; copy it from rdi
   (first integer argument under the System V AMD64 calling convention). */
mov rcx, rdi
|
||||||
|
#include "cn_ultralitev2_main_loop_ryzen.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/*
 * Exported entry point for the single-hash cn-ultralite v2 main loop
 * (Bulldozer flavour, per the name). It is a thin wrapper around the loop
 * body in cn_ultralitev2_main_loop_bulldozer.inc.
 */
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm):
|
||||||
|
/* Reserve 48 bytes: the included body saves rbx, rbp and rsi at [rsp+16],
   [rsp+24] and [rsp+32], i.e. above its own entry stack pointer. */
sub rsp, 48
|
||||||
|
/* The included body reads its context pointer from rcx; copy it from rdi
   (first integer argument under the System V AMD64 calling convention). */
mov rcx, rdi
|
||||||
|
/* The body ends with a jump to its own trailing label, so execution
   continues with the stack release below. */
#include "cn_ultralitev2_main_loop_bulldozer.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
||||||
|
|
||||||
#ifdef __APPLE__
|
#ifdef __APPLE__
|
||||||
ALIGN 16
|
ALIGN 16
|
||||||
#else
|
#else
|
||||||
|
@ -244,3 +298,15 @@ FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||||
#include "cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
#include "cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||||
add rsp, 48
|
add rsp, 48
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/*
 * Exported entry point for the single-hash cn-ultralite v2 main loop in its
 * software-AES variant (per the name). It is a thin wrapper around the loop
 * body in cn_ultralitev2_mainloop_soft_aes_sandybridge.inc.
 * NOTE(review): that include is not shown here; the notes below assume it
 * follows the same conventions as the other cn_ultralitev2 bodies - confirm.
 */
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
|
||||||
|
/* Reserve 48 bytes of stack for the included body (presumably used as
   scratch space above its entry stack pointer, as in the sibling bodies). */
sub rsp, 48
|
||||||
|
/* Copy the context pointer from rdi (first integer argument under the
   System V AMD64 calling convention) into rcx for the included body. */
mov rcx, rdi
|
||||||
|
#include "cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
|
||||||
|
add rsp, 48
|
||||||
|
ret 0
|
414
src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc
Normal file
414
src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc
Normal file
|
@ -0,0 +1,414 @@
|
||||||
|
/*
 * Prologue of the dual-hash cn-ultralite v2 main loop.
 * On entry rcx and rdx point at the two per-hash contexts. This part saves
 * registers, switches MXCSR, derives the loop state of both lanes from the
 * contexts and fetches the first 16-byte chunk of each buffer.
 * rax keeps the entry rsp; the first three xmm spills are addressed
 * relative to it.
 */
mov rax, rsp
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 184
|
||||||
|
|
||||||
|
/* Save the current MXCSR at [rsp+272] and load 24448 (0x5F80) instead.
   Both slots lie above the entry rsp (8 pushes + 184 = 248 bytes below it).
   NOTE(review): per the MXCSR layout 0x5F80 is the default exception masks
   (0x1F80) plus rounding control 0x4000 (round up) - confirm. */
stmxcsr DWORD PTR [rsp+272]
|
||||||
|
mov DWORD PTR [rsp+276], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+276]
|
||||||
|
|
||||||
|
/* Lane 0 state from the context at rcx (also kept in r8):
     r13     = qword at +224, base address of every lane-0 buffer access
               (presumably ScratchPad::memory - confirm against the struct)
     r10:r11 = context qwords 0,1 XOR qwords 4,5
     r14d    = 65536 loop iterations
   Lane 1 (context at rdx, kept in r9) uses rsi and rdi:rbp the same way. */
mov r13, QWORD PTR [rcx+224]
|
||||||
|
mov r9, rdx
|
||||||
|
mov r10, QWORD PTR [rcx+32]
|
||||||
|
mov r8, rcx
|
||||||
|
xor r10, QWORD PTR [rcx]
|
||||||
|
mov r14d, 65536
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rsi, QWORD PTR [rdx+224]
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
mov rdi, QWORD PTR [r9+32]
|
||||||
|
xor rdi, QWORD PTR [r9]
|
||||||
|
mov rbp, QWORD PTR [r9+40]
|
||||||
|
xor rbp, QWORD PTR [r9+8]
|
||||||
|
movq xmm0, rdx
|
||||||
|
/* Spill xmm6-xmm15; the epilogue reloads them from the same stack slots. */
movaps XMMWORD PTR [rax-88], xmm6
|
||||||
|
movaps XMMWORD PTR [rax-104], xmm7
|
||||||
|
movaps XMMWORD PTR [rax-120], xmm8
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm9
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm13
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm14
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm15
|
||||||
|
/* edx = first lane-0 offset: r10 masked with 131056 (0x1FFF0), i.e. a
   16-byte-aligned offset below 128 KiB. xmm13 is zeroed. xmm7 is started
   here and completed below as context-0 qwords 2,3 XOR qwords 6,7. */
mov rdx, r10
|
||||||
|
movq xmm4, QWORD PTR [r8+96]
|
||||||
|
and edx, 131056
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
xorps xmm13, xmm13
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r8+72]
|
||||||
|
movq xmm5, QWORD PTR [r8+104]
|
||||||
|
movq xmm7, rax
|
||||||
|
|
||||||
|
/* xmm14 = 1 << 52 in both qword lanes. */
mov eax, 1
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm14, rax
|
||||||
|
punpcklqdq xmm14, xmm14
|
||||||
|
|
||||||
|
/* xmm12 = 1023 << 52 in both qword lanes (0x3FF0000000000000, the bit
   pattern of the double 1.0). */
mov eax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm12, rax
|
||||||
|
punpcklqdq xmm12, xmm12
|
||||||
|
|
||||||
|
/* xmm3 = context-0 qwords 8,9 XOR qwords 10,11; xmm6 and xmm8 are the
   lane-1 counterparts of xmm7 and xmm3. r13, r10 and r11 are also stored
   on the stack; the loop overwrites those registers and reloads them from
   these slots. */
mov rax, QWORD PTR [r8+80]
|
||||||
|
xor rax, QWORD PTR [r8+64]
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [r9+56]
|
||||||
|
xor rcx, QWORD PTR [r9+24]
|
||||||
|
movq xmm3, rax
|
||||||
|
mov rax, QWORD PTR [r9+48]
|
||||||
|
xor rax, QWORD PTR [r9+16]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov QWORD PTR [rsp], r13
|
||||||
|
mov rcx, QWORD PTR [r9+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movq xmm6, rax
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov QWORD PTR [rsp+256], r10
|
||||||
|
/* ecx = first lane-1 offset (rdi masked with 0x1FFF0). */
mov rcx, rdi
|
||||||
|
mov QWORD PTR [rsp+264], r11
|
||||||
|
movq xmm8, rax
|
||||||
|
and ecx, 131056
|
||||||
|
punpcklqdq xmm8, xmm0
|
||||||
|
/* xmm4 = {context-0 qword 12, context-1 qword 12},
   xmm5 = {context-0 qword 13, context-1 qword 13}.
   r8 / r9 become the addresses of the first lane-1 / lane-0 chunk, which
   are preloaded into xmm11 / xmm15. */
movq xmm0, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movq xmm0, QWORD PTR [r9+104]
|
||||||
|
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
movdqu xmm11, XMMWORD PTR [r8]
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
lea r9, QWORD PTR [rdx+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r9]
|
||||||
|
|
||||||
|
/*
 * Main loop of the dual-hash variant: r14d iterations, two independent
 * hashes ("lanes") per iteration.
 *   lane 0: buffer base r13, state r10:r11, history xmm7 / xmm3
 *   lane 1: buffer base rsi, state rdi:rbp, history xmm6 / xmm8
 * xmm4 and xmm5 carry the packed division and square-root results of both
 * lanes into the next iteration. All buffer offsets are masked with 131056
 * (0x1FFF0).
 */
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
main_loop_double_ultralitev2_sandybridge:
|
||||||
|
/* Lane 0: one AES round on the current chunk (xmm15) with round key
   r10:r11; eax / ebx / edx = offsets of the chunk XOR 16 / 32 / 48. */
movdqu xmm9, xmm15
|
||||||
|
mov eax, edx
|
||||||
|
mov ebx, edx
|
||||||
|
xor eax, 16
|
||||||
|
xor ebx, 32
|
||||||
|
xor edx, 48
|
||||||
|
|
||||||
|
movq xmm0, r11
|
||||||
|
movq xmm2, r10
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
aesenc xmm9, xmm2
|
||||||
|
|
||||||
|
/* Lane 0 shuffle: the chunks at offset^16, ^32 and ^48 get xmm7, the round
   key and xmm3 added (paddq) and are stored back at ^32, ^48 and ^16. */
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||||
|
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||||
|
|
||||||
|
/* r11 = low qword of the AES result; edx = next lane-0 offset. The current
   chunk (address in r9) is overwritten with AES result XOR xmm7. */
movq r11, xmm9
|
||||||
|
mov edx, r11d
|
||||||
|
and edx, 131056
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
pxor xmm0, xmm7
|
||||||
|
movdqu XMMWORD PTR [r9], xmm0
|
||||||
|
|
||||||
|
/* rbx / r10 = address / first qword of the next lane-0 chunk. */
lea rbx, QWORD PTR [rdx+r13]
|
||||||
|
mov r10, QWORD PTR [rdx+r13]
|
||||||
|
|
||||||
|
/* Lane 1: same AES round on xmm11 with round key rdi:rbp. */
movdqu xmm10, xmm11
|
||||||
|
movq xmm0, rbp
|
||||||
|
movq xmm11, rdi
|
||||||
|
punpcklqdq xmm11, xmm0
|
||||||
|
aesenc xmm10, xmm11
|
||||||
|
|
||||||
|
mov eax, ecx
|
||||||
|
mov r12d, ecx
|
||||||
|
xor eax, 16
|
||||||
|
xor r12d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
|
||||||
|
/* Lane 1 shuffle on the buffer at rsi, with addends xmm6, the round key
   (xmm11) and xmm8. */
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||||
|
|
||||||
|
/* rcx = next lane-1 offset; the current lane-1 chunk (address in r8) is
   overwritten with AES result XOR xmm6; r12 = first qword of the next
   lane-1 chunk. */
movq rcx, xmm10
|
||||||
|
and ecx, 131056
|
||||||
|
|
||||||
|
movdqa xmm0, xmm10
|
||||||
|
pxor xmm0, xmm6
|
||||||
|
movdqu XMMWORD PTR [r8], xmm0
|
||||||
|
mov r12, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
/* r9 = second qword of the next lane-0 chunk (read before it is updated). */
mov r9, QWORD PTR [rbx+8]
|
||||||
|
|
||||||
|
xor edx, 16
|
||||||
|
mov r8d, edx
|
||||||
|
mov r15d, edx
|
||||||
|
|
||||||
|
/* Lane 0 multiply: r10 ^= (low(xmm5) << 32) ^ low(xmm4), then
   rdx:rax = r10 * r11 (unsigned 64x64 -> 128). The product halves are
   XORed with the chunk at offset^32 for the state update below; xmm0
   holds the unmodified {hi, lo} pair. */
movq rdx, xmm5
|
||||||
|
shl rdx, 32
|
||||||
|
movq rax, xmm4
|
||||||
|
xor rdx, rax
|
||||||
|
xor r10, rdx
|
||||||
|
mov rax, r10
|
||||||
|
mul r11
|
||||||
|
mov r11d, r8d
|
||||||
|
xor r11d, 48
|
||||||
|
movq xmm0, rdx
|
||||||
|
xor rdx, [r11+r13]
|
||||||
|
movq xmm1, rax
|
||||||
|
xor rax, [r11+r13+8]
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
/* Second lane-0 shuffle around the new offset: the chunk at ^16 is XORed
   with the product before xmm7 is added; destinations rotate as in the
   first shuffle (^16 -> ^32, ^32 -> ^48, ^48 -> ^16). */
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
xor r8d, 32
|
||||||
|
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||||
|
|
||||||
|
/* Lane 0 state update: add the XORed product halves to the pair saved at
   [rsp+256] / [rsp+264], store the sums into the chunk at rbx, then XOR
   them with r10 (the mixed first qword) and r9 (the chunk's previous
   second qword). The new pair and the masked next offset go back to the
   stack; xmm15 preloads the next chunk, r15 keeps its address and r13 is
   reused as the lane-1 chunk address. */
mov r11, QWORD PTR [rsp+256]
|
||||||
|
add r11, rdx
|
||||||
|
mov rdx, QWORD PTR [rsp+264]
|
||||||
|
add rdx, rax
|
||||||
|
mov QWORD PTR [rbx], r11
|
||||||
|
xor r11, r10
|
||||||
|
mov QWORD PTR [rbx+8], rdx
|
||||||
|
xor rdx, r9
|
||||||
|
mov QWORD PTR [rsp+256], r11
|
||||||
|
and r11d, 131056
|
||||||
|
mov QWORD PTR [rsp+264], rdx
|
||||||
|
mov QWORD PTR [rsp+8], r11
|
||||||
|
lea r15, QWORD PTR [r11+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||||
|
lea r13, QWORD PTR [rsi+rcx]
|
||||||
|
/* Division / square-root stage for both lanes at once.
   r10 becomes the lane-1 mixed qword: (high(xmm5) << 32) ^ high(xmm4) ^ r12.
   Dividends r11 / r8 are the high qwords of the two AES results. Divisors
   rdx / r9 are the low dwords of (AES low qword + 2 * xmm5 lane) ORed with
   0x80000001 (-2147483647). */
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movaps xmm2, xmm13
|
||||||
|
movq r10, xmm0
|
||||||
|
psllq xmm5, 1
|
||||||
|
shl r10, 32
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movdqa xmm1, xmm10
|
||||||
|
movq r11, xmm0
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movq r8, xmm1
|
||||||
|
psrldq xmm4, 8
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
movq rax, xmm4
|
||||||
|
xor r10, rax
|
||||||
|
movaps xmm1, xmm13
|
||||||
|
xor r10, r12
|
||||||
|
/* Quotient estimate in double precision: ((dividend + 1) >> 1) / divisor
   for both lanes (divpd); adding 1 << 52 (xmm14) to the raw bits then
   raises the exponent by one before the truncating conversion. */
lea rax, QWORD PTR [r11+1]
|
||||||
|
shr rax, 1
|
||||||
|
movdqa xmm3, xmm9
|
||||||
|
punpcklqdq xmm3, xmm10
|
||||||
|
paddq xmm5, xmm3
|
||||||
|
movq rdx, xmm5
|
||||||
|
psrldq xmm5, 8
|
||||||
|
cvtsi2sd xmm2, rax
|
||||||
|
or edx, -2147483647
|
||||||
|
lea rax, QWORD PTR [r8+1]
|
||||||
|
shr rax, 1
|
||||||
|
movq r9, xmm5
|
||||||
|
cvtsi2sd xmm0, rax
|
||||||
|
or r9d, -2147483647
|
||||||
|
cvtsi2sd xmm1, rdx
|
||||||
|
unpcklpd xmm2, xmm0
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
cvtsi2sd xmm0, r9
|
||||||
|
unpcklpd xmm1, xmm0
|
||||||
|
divpd xmm2, xmm1
|
||||||
|
paddq xmm2, xmm14
|
||||||
|
cvttsd2si rax, xmm2
|
||||||
|
psrldq xmm2, 8
|
||||||
|
/* Lane 0: rbx = quotient estimate, r11 = dividend - quotient * divisor.
   A negative remainder is corrected out of line (div_fix_1). */
mov rbx, rax
|
||||||
|
imul rax, rdx
|
||||||
|
sub r11, rax
|
||||||
|
js div_fix_1_ultralitev2_sandybridge
|
||||||
|
div_fix_1_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
/* Lane 1: same check with quotient rdx, divisor r9 and remainder r8. */
cvttsd2si rdx, xmm2
|
||||||
|
mov rax, rdx
|
||||||
|
imul rax, r9
|
||||||
|
movd xmm2, r11d
|
||||||
|
movd xmm4, ebx
|
||||||
|
sub r8, rax
|
||||||
|
js div_fix_2_ultralitev2_sandybridge
|
||||||
|
div_fix_2_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
/* Pack xmm4 = {q0 | rem0 << 32, q1 | rem1 << 32} (low dwords only), add it
   to the AES low qwords in xmm3, and take sqrtpd of the doubles whose bits
   are (sum >> 12) + xmm12. xmm5 = raw sqrt bits >> 19. If the low 19 bits
   (mask 524287) of a raw result are zero, that lane takes the out-of-line
   fix-up. */
movd xmm1, r8d
|
||||||
|
movd xmm0, edx
|
||||||
|
punpckldq xmm2, xmm1
|
||||||
|
punpckldq xmm4, xmm0
|
||||||
|
punpckldq xmm4, xmm2
|
||||||
|
paddq xmm3, xmm4
|
||||||
|
movdqa xmm0, xmm3
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm12
|
||||||
|
sqrtpd xmm1, xmm0
|
||||||
|
movq r9, xmm1
|
||||||
|
movdqa xmm5, xmm1
|
||||||
|
psrlq xmm5, 19
|
||||||
|
test r9, 524287
|
||||||
|
je sqrt_fix_1_ultralitev2_sandybridge
|
||||||
|
sqrt_fix_1_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
/* r9 = low qword of the lane-1 AES result (multiplier for the lane-1
   product below); r8 = raw lane-1 sqrt bits for the same test. */
movq r9, xmm10
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movq r8, xmm1
|
||||||
|
test r8, 524287
|
||||||
|
je sqrt_fix_2_ultralitev2_sandybridge
|
||||||
|
sqrt_fix_2_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
/* Lane 1 multiply: rdx:rax = r10 * r9, xmm3 = {hi, lo}. */
mov r12d, ecx
|
||||||
|
mov r8d, ecx
|
||||||
|
xor r12d, 16
|
||||||
|
xor r8d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
mov rax, r10
|
||||||
|
mul r9
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm3, rdx
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
|
||||||
|
/* Second lane-1 shuffle: chunk^16 XOR product + xmm6 -> ^32, chunk^32 +
   round key -> ^48, chunk^48 + xmm8 -> ^16. The product halves in
   rdx / rax are XORed with chunk^32 for the state update. */
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||||
|
xor rdx, [r8+rsi]
|
||||||
|
xor rax, [r8+rsi+8]
|
||||||
|
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
paddq xmm3, xmm8
|
||||||
|
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||||
|
|
||||||
|
/* Lane 1 state update: rdi:rbp += product halves, stored into the chunk at
   r13, then XORed with r10 and the chunk's previous second qword.
   ecx / r8 / xmm11 = next lane-1 offset, address and chunk. */
add rdi, rdx
|
||||||
|
mov QWORD PTR [r13], rdi
|
||||||
|
xor rdi, r10
|
||||||
|
mov ecx, edi
|
||||||
|
and ecx, 131056
|
||||||
|
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
mov rdx, QWORD PTR [r13+8]
|
||||||
|
add rbp, rax
|
||||||
|
mov QWORD PTR [r13+8], rbp
|
||||||
|
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||||
|
xor rbp, rdx
|
||||||
|
/* Rotate the history registers for the next iteration (xmm3 <- xmm7 <-
   lane-0 AES result, xmm8 <- xmm6 <- lane-1 AES result) and reload the
   lane-0 registers stored on the stack (r13, rdx, r10, r11); r9 = address
   of the next lane-0 chunk. */
mov r13, QWORD PTR [rsp]
|
||||||
|
movdqa xmm3, xmm7
|
||||||
|
mov rdx, QWORD PTR [rsp+8]
|
||||||
|
movdqa xmm8, xmm6
|
||||||
|
mov r10, QWORD PTR [rsp+256]
|
||||||
|
movdqa xmm7, xmm9
|
||||||
|
mov r11, QWORD PTR [rsp+264]
|
||||||
|
movdqa xmm6, xmm10
|
||||||
|
mov r9, r15
|
||||||
|
dec r14d
|
||||||
|
jne main_loop_double_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
/*
 * Epilogue of the dual-hash loop: restore the MXCSR value saved by the
 * prologue, reload xmm6-xmm15 from their spill slots, pop the eight pushed
 * registers and jump over the out-of-line fix-up code to the end label.
 */
ldmxcsr DWORD PTR [rsp+272]
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||||
|
/* r11 = rsp + 184, the address of the last pushed register; the
   r11-relative loads address the same slots the prologue filled through
   rax-relative and rsp-relative stores. */
lea r11, QWORD PTR [rsp+184]
|
||||||
|
movaps xmm6, XMMWORD PTR [r11-24]
|
||||||
|
movaps xmm7, XMMWORD PTR [r11-40]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-56]
|
||||||
|
movaps xmm9, XMMWORD PTR [r11-72]
|
||||||
|
movaps xmm10, XMMWORD PTR [r11-88]
|
||||||
|
movaps xmm11, XMMWORD PTR [r11-104]
|
||||||
|
movaps xmm12, XMMWORD PTR [r11-120]
|
||||||
|
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||||
|
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
||||||
|
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
|
||||||
|
|
||||||
|
/*
 * Out-of-line correction for the lane-0 division, entered from the main
 * loop when dividend - quotient * divisor came out negative: decrement the
 * quotient (rbx) and add the divisor (rdx) back to the remainder (r11).
 */
div_fix_1_ultralitev2_sandybridge:
|
||||||
|
dec rbx
|
||||||
|
add r11, rdx
|
||||||
|
jmp div_fix_1_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
/*
 * Out-of-line correction for the lane-1 division, entered from the main
 * loop when dividend - quotient * divisor came out negative: decrement the
 * quotient (rdx) and add the divisor (r9) back to the remainder (r8).
 */
div_fix_2_ultralitev2_sandybridge:
|
||||||
|
dec rdx
|
||||||
|
add r8, r9
|
||||||
|
jmp div_fix_2_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
/*
 * Out-of-line recomputation of the lane-0 square-root result, entered from
 * the main loop when the low 19 bits of the raw sqrtpd bits (r9) are zero.
 * Inputs:  r9 = raw lane-0 sqrt bits, xmm3 low qword = lane-0 sqrt input,
 *          xmm5 = {lane-0, lane-1} shifted results.
 * Output:  low qword of xmm5 replaced by the recomputed value, high qword
 *          kept. Clobbers rax, rdx, r8, r9, r11 and xmm0.
 * NOTE(review): presumably mirrors the integer sqrt fix-up of the C
 * implementation - confirm against it.
 */
sqrt_fix_1_ultralitev2_sandybridge:
|
||||||
|
movq r8, xmm3
|
||||||
|
/* xmm0 = lane-1 value of xmm5, to be re-attached at the end. */
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
dec r9
|
||||||
|
/* r11 = -1022 << 32 (mov to r11d zero-extends, the shift moves the value
   into the upper dword). */
mov r11d, -1022
|
||||||
|
shl r11, 32
|
||||||
|
/* r9 = (raw - 1) >> 19 is the candidate result, rax = (raw - 1) >> 20. */
mov rax, r9
|
||||||
|
shr r9, 19
|
||||||
|
shr rax, 20
|
||||||
|
/* rdx = (r9 - rax + r11 + 1) * (rax + r11); comparing it with the sqrt
   input (r8) produces a borrow that adc adds to the candidate. */
mov rdx, r9
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+r11+1]
|
||||||
|
add rax, r11
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r8
|
||||||
|
adc r9, 0
|
||||||
|
movq xmm5, r9
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
/*
 * Out-of-line recomputation of the lane-1 square-root result, entered from
 * the main loop when the low 19 bits of the raw sqrtpd bits (r8) are zero.
 * Inputs:  r8 = raw lane-1 sqrt bits, xmm3 high qword = lane-1 sqrt input,
 *          xmm5 low qword = lane-0 result.
 * Output:  high qword of xmm5 replaced by the recomputed value, low qword
 *          kept. Clobbers rax, rbx, rdx, r8, r11, xmm0 and xmm3.
 * NOTE(review): presumably mirrors the integer sqrt fix-up of the C
 * implementation - confirm against it.
 */
sqrt_fix_2_ultralitev2_sandybridge:
|
||||||
|
psrldq xmm3, 8
|
||||||
|
movq r11, xmm3
|
||||||
|
dec r8
|
||||||
|
/* rbx = -1022 << 32 (mov to ebx zero-extends, the shift moves the value
   into the upper dword). */
mov ebx, -1022
|
||||||
|
shl rbx, 32
|
||||||
|
/* r8 = (raw - 1) >> 19 is the candidate result, rax = (raw - 1) >> 20. */
mov rax, r8
|
||||||
|
shr r8, 19
|
||||||
|
shr rax, 20
|
||||||
|
/* rdx = (r8 - rax + rbx + 1) * (rax + rbx); comparing it with the sqrt
   input (r11) produces a borrow that adc adds to the candidate. */
mov rdx, r8
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+rbx+1]
|
||||||
|
add rax, rbx
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r11
|
||||||
|
adc r8, 0
|
||||||
|
movq xmm0, r8
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:
|
180
src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
||||||
|
/*
 * Prologue of the single-hash cn-ultralite v2 main loop (Bulldozer flavour,
 * per the file name). rcx points at the per-hash context.
 * rbx, rbp and rsi are saved 16..39 bytes above the entry rsp (the .S
 * wrapper reserves that area with sub rsp, 48); rdi and r12-r15 are pushed.
 */
mov QWORD PTR [rsp+16], rbx
|
||||||
|
mov QWORD PTR [rsp+24], rbp
|
||||||
|
mov QWORD PTR [rsp+32], rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
/* Save the current MXCSR at [rsp] and load 24448 (0x5F80) instead.
   NOTE(review): per the MXCSR layout 0x5F80 is the default exception masks
   (0x1F80) plus rounding control 0x4000 (round up) - confirm. */
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
/* Loop state derived from the context (pointer also kept in r9):
     r8:r11 = qwords 0,1 XOR qwords 4,5
     xmm3   = qwords 2,3 XOR qwords 6,7
     xmm4   = qwords 8,9 XOR qwords 10,11
     rbx    = qword at +224, base address of every buffer access
              (presumably ScratchPad::memory - confirm against the struct)
     r15    = qword 12, rdi = qword 13
     ebp    = 65536 loop iterations
     r10d   = first offset: r8 masked with 131056 (0x1FFF0) */
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov ebp, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movq xmm3, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movq xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
mov rdi, QWORD PTR [r9+104]
|
||||||
|
and r10d, 131056
|
||||||
|
/* Spill xmm6-xmm8 (reloaded in the epilogue); xmm8 is then zeroed. */
movaps XMMWORD PTR [rsp+48], xmm6
|
||||||
|
movq xmm4, rax
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm8
|
||||||
|
xorps xmm8, xmm8
|
||||||
|
/* xmm7 = 1023 << 52. Only ax is written, but the shift by 52 discards every
   bit of rax above bit 11, so the stale upper bits do not matter.
   NOTE(review): xmm7 does not appear to be read by the loop that follows,
   which rebuilds the same constant in rdi each iteration. */
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm7, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
|
||||||
|
/*
 * Main loop of the single-hash Bulldozer variant: ebp iterations.
 * State: buffer base rbx, current offset r10d, round key r8:r11, history
 * xmm3 / xmm4, division result r15, square-root result rdi. Buffer chunks
 * are accessed with aligned moves (movdqa); offsets are masked with 131056
 * (0x1FFF0).
 */
ALIGN 16
|
||||||
|
cnv2_main_loop_ultralitev2_bulldozer:
|
||||||
|
/* Load the current chunk, build the round key xmm6 = r8:r11 and keep the
   chunk address in rdx. r9 = 2 * rdi and rdi <<= 32 prepare the divisor
   and the mix value used below. */
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||||
|
movq xmm6, r8
|
||||||
|
pinsrq xmm6, r11, 1
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
lea r9, QWORD PTR [rdi+rdi]
|
||||||
|
shl rdi, 32
|
||||||
|
|
||||||
|
/* AES round plus first shuffle: the chunks at offset^16 / ^32 / ^48 get
   xmm3 / the round key / xmm4 added (paddq) and are stored back at
   ^32 / ^48 / ^16. */
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
aesenc xmm5, xmm6
|
||||||
|
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
paddq xmm0, xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
/* rsi = r15 ^ (previous rdi << 32): the value mixed into the next chunk's
   first qword. */
movaps xmm1, xmm8
|
||||||
|
mov rsi, r15
|
||||||
|
xor rsi, rdi
|
||||||
|
|
||||||
|
/* rdi is reused for the constant 1023 << 52 (bit pattern of 1.0). */
mov edi, 1023
|
||||||
|
shl rdi, 52
|
||||||
|
|
||||||
|
/* r14 = low qword, rax = high qword of the AES result. */
movq r14, xmm5
|
||||||
|
pextrq rax, xmm5, 1
|
||||||
|
|
||||||
|
/* Write back AES result XOR xmm3 to the current chunk, compute the next
   offset r10, fold the next chunk's first qword into rsi and remember the
   chunk's address (r12) and second qword (r13). */
movdqa xmm0, xmm5
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
mov r10, r14
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rsi, QWORD PTR [r10+rbx]
|
||||||
|
lea r12, QWORD PTR [r10+rbx]
|
||||||
|
mov r13, QWORD PTR [r10+rbx+8]
|
||||||
|
|
||||||
|
/* Division: divisor r9 = low dword of (2 * previous sqrt + r14) ORed with
   0x80000001; dividend 0:rax (high qword of the AES result).
   r15 = (quotient & 0xFFFFFFFF) + (remainder << 32). */
add r9d, r14d
|
||||||
|
or r9d, -2147483647
|
||||||
|
xor edx, edx
|
||||||
|
div r9
|
||||||
|
mov eax, eax
|
||||||
|
shl rdx, 32
|
||||||
|
lea r15, [rax+rdx]
|
||||||
|
/* Square root: sqrtsd of the double whose bits are
   ((r14 + r15) >> 12) + (1023 << 52). The result is the raw bits >> 19,
   unless their low 19 bits (mask 524287) are zero, in which case the
   out-of-line fix-up computes it. */
lea rax, [r14+r15]
|
||||||
|
shr rax, 12
|
||||||
|
add rax, rdi
|
||||||
|
movq xmm0, rax
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdi, xmm1
|
||||||
|
test rdi, 524287
|
||||||
|
je sqrt_fixup_ultralitev2_bulldozer
|
||||||
|
shr rdi, 19
|
||||||
|
|
||||||
|
sqrt_fixup_ultralitev2_bulldozer_ret:
|
||||||
|
/* rdx:rax = rsi * r14 (unsigned 64x64 -> 128); xmm0 = {hi, lo}. */
mov rax, rsi
|
||||||
|
mul r14
|
||||||
|
movq xmm1, rax
|
||||||
|
movq xmm0, rdx
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
/* Second shuffle around the new offset: chunk^16 XOR product + xmm3 -> ^32,
   chunk^32 + round key -> ^48, chunk^48 + xmm4 -> ^16. The product halves
   in rdx / rax are also XORed with chunk^32 for the state update. */
mov r9d, r10d
|
||||||
|
mov ecx, r10d
|
||||||
|
xor r9d, 16
|
||||||
|
xor ecx, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
pxor xmm2, xmm0
|
||||||
|
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
/* State update: xmm4 <- xmm3 <- AES result; r8:r11 += product halves,
   stored into the chunk at r12, then XORed with rsi / r13 to form the next
   round key; r10d = next offset. */
movdqa xmm4, xmm3
|
||||||
|
add r8, rdx
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r12], r8
|
||||||
|
xor r8, rsi
|
||||||
|
mov QWORD PTR [r12+8], r11
|
||||||
|
mov r10, r8
|
||||||
|
xor r11, r13
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa xmm3, xmm5
|
||||||
|
dec ebp
|
||||||
|
jne cnv2_main_loop_ultralitev2_bulldozer
|
||||||
|
|
||||||
|
/*
 * Epilogue of the single-hash Bulldozer loop: restore the MXCSR value
 * saved by the prologue, reload xmm6-xmm8 and rbx / rbp / rsi, pop the
 * pushed registers and jump over the out-of-line fix-up code to the end
 * label.
 */
ldmxcsr DWORD PTR [rsp]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||||
|
/* r11 = rsp + 64, the address of the last pushed register. With five
   pushes above it, r11+56 / +64 / +72 are the slots 16 / 24 / 32 bytes
   above the entry rsp where the prologue stored rbx / rbp / rsi. */
lea r11, QWORD PTR [rsp+64]
|
||||||
|
mov rbx, QWORD PTR [r11+56]
|
||||||
|
mov rbp, QWORD PTR [r11+64]
|
||||||
|
mov rsi, QWORD PTR [r11+72]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-48]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
|
||||||
|
|
||||||
|
/*
 * Out-of-line recomputation of the square-root result, entered from the
 * main loop when the low 19 bits of the raw sqrtsd bits (rdi) are zero.
 * Inputs:  rdi = raw sqrt bits, xmm5 low qword and r15 (their sum is the
 *          sqrt input).
 * Output:  rdi = recomputed result. Clobbers rax, rcx, rdx and r9.
 * NOTE(review): presumably mirrors the integer sqrt fix-up of the C
 * implementation - confirm against it.
 */
sqrt_fixup_ultralitev2_bulldozer:
|
||||||
|
movq r9, xmm5
|
||||||
|
add r9, r15
|
||||||
|
dec rdi
|
||||||
|
/* rdx = -1022 << 32 (mov to edx zero-extends, the shift moves the value
   into the upper dword). */
mov edx, -1022
|
||||||
|
shl rdx, 32
|
||||||
|
/* rdi = (raw - 1) >> 19 is the candidate result, rax = (raw - 1) >> 20. */
mov rax, rdi
|
||||||
|
shr rdi, 19
|
||||||
|
shr rax, 20
|
||||||
|
/* rcx = (rdi - rax + rdx + 1) * (rax + rdx); comparing it with the sqrt
   input (r9) produces a borrow that adc adds to the candidate. */
mov rcx, rdi
|
||||||
|
sub rcx, rax
|
||||||
|
lea rcx, [rcx+rdx+1]
|
||||||
|
add rax, rdx
|
||||||
|
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdi, 0
|
||||||
|
jmp sqrt_fixup_ultralitev2_bulldozer_ret
|
||||||
|
|
||||||
|
cnv2_main_loop_ultralitev2_bulldozer_endp:
|
186
src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc
Normal file
186
src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc
Normal file
|
@ -0,0 +1,186 @@
|
||||||
|
/*
 * Prologue of the single-hash cn-ultralite v2 main loop (Ivy Bridge
 * flavour, per the file name). rcx points at the per-hash context.
 * rbx is saved 24 bytes above the entry rsp (the .S wrapper reserves that
 * area with sub rsp, 48); rbp, rsi, rdi and r12-r15 are pushed.
 */
mov QWORD PTR [rsp+24], rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 80
|
||||||
|
|
||||||
|
/* Save the current MXCSR at [rsp] and load 24448 (0x5F80) instead.
   NOTE(review): per the MXCSR layout 0x5F80 is the default exception masks
   (0x1F80) plus rounding control 0x4000 (round up) - confirm. */
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
/* Loop state derived from the context (pointer also kept in r9):
     r8:r11 = qwords 0,1 XOR qwords 4,5
     xmm4   = qwords 2,3 XOR qwords 6,7
     xmm5   = qwords 8,9 XOR qwords 10,11
     rbx    = qword at +224, base address of every buffer access
              (presumably ScratchPad::memory - confirm against the struct)
     r15    = qword 12, xmm3 = qword 13
     esi    = 65536 loop iterations
     r13d   = 0x80000001 (-2147483647), ORed into the divisor in the loop
     r10d   = first offset: r8 masked with 131056 (0x1FFF0) */
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov esi, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
mov r13d, -2147483647
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movq xmm4, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movq xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movq xmm3, QWORD PTR [r9+104]
|
||||||
|
/* Spill xmm6-xmm8; the epilogue reloads them from the same slots. */
movaps XMMWORD PTR [rsp+64], xmm6
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm8
|
||||||
|
and r10d, 131056
|
||||||
|
movq xmm5, rax
|
||||||
|
|
||||||
|
/* xmm8 = 1023 << 52 (bit pattern of the double 1.0). Only ax is written,
   but the shift by 52 discards every bit of rax above bit 11, so the stale
   upper bits do not matter. */
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm8, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
/* Preload the first chunk for the loop. */
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||||
|
|
||||||
|
/*
 * Main loop of the single-hash Ivy Bridge variant: rsi iterations.
 * State: buffer base rbx, current offset r10d, current chunk xmm6, round
 * key r8:r11, history xmm4 / xmm5, division result r15, square-root result
 * xmm3, r13d = 0x80000001. Offsets are masked with 131056 (0x1FFF0).
 */
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
$main_loop_ultralitev2_ivybridge:
|
||||||
|
/* rdx = address of the current chunk; ecx / eax / r10d = its offset XOR
   16 / 32 / 48; rdi starts as the previous division result (r15). */
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
mov rdi, r15
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
/* One AES round on the current chunk with round key xmm7 = r8:r11.
   rbp = low qword of the result, r9 = next offset. */
movq xmm0, r11
|
||||||
|
movq xmm7, r8
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
aesenc xmm6, xmm7
|
||||||
|
movq rbp, xmm6
|
||||||
|
mov r9, rbp
|
||||||
|
and r9d, 131056
|
||||||
|
/* First shuffle: the chunks at offset^16 / ^32 / ^48 get xmm4 / the round
   key / xmm5 added (paddq) and are stored back at ^32 / ^48 / ^16. */
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
/* r10d = next offset ^ 32. rdi ^= (previous sqrt << 32), then the current
   chunk is overwritten with AES result XOR xmm4 and rdi is XORed with the
   next chunk's first qword. r14 / r12 = next chunk address / its second
   qword. */
mov r10, r9
|
||||||
|
xor r10d, 32
|
||||||
|
movq rcx, xmm3
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
xor rdi, rax
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
pxor xmm0, xmm4
|
||||||
|
movdqu XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rdi, QWORD PTR [r9+rbx]
|
||||||
|
lea r14, QWORD PTR [r9+rbx]
|
||||||
|
mov r12, QWORD PTR [r14+8]
|
||||||
|
/* Division: divisor r9 = low dword of (2 * previous sqrt + rbp) ORed with
   r13d (0x80000001); dividend 0:rax = high qword of the AES result.
   rdx = r15 = (quotient & 0xFFFFFFFF) + (remainder << 32). */
xor edx, edx
|
||||||
|
lea r9d, DWORD PTR [ecx+ecx]
|
||||||
|
add r9d, ebp
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
psrldq xmm0, 8
|
||||||
|
or r9d, r13d
|
||||||
|
movq rax, xmm0
|
||||||
|
div r9
|
||||||
|
xorps xmm3, xmm3
|
||||||
|
mov eax, eax
|
||||||
|
shl rdx, 32
|
||||||
|
add rdx, rax
|
||||||
|
/* Square root: r9 = division result + rbp is the input; sqrtsd of the
   double whose bits are (r9 >> 12) + xmm8. The result is the raw bits
   >> 19, unless their low 19 bits (mask 524287) are zero, in which case
   the out-of-line fix-up computes it. */
lea r9, QWORD PTR [rdx+rbp]
|
||||||
|
mov r15, rdx
|
||||||
|
mov rax, r9
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
sqrtsd xmm3, xmm0
|
||||||
|
movq rdx, xmm3
|
||||||
|
test edx, 524287
|
||||||
|
je $sqrt_fixup_ultralitev2_ivybridge
|
||||||
|
psrlq xmm3, 19
|
||||||
|
$sqrt_fixup_ultralitev2_ivybridge_ret:
|
||||||
|
|
||||||
|
/* rdx:rax = rdi * rbp (unsigned 64x64 -> 128); xmm2 = {hi, lo}. Each half
   is XORed with the chunk at offset^32 and added to r8 / r11; the sums are
   stored into the chunk at r14. r8 ^ rdi, masked, gives the next offset
   (edi). */
mov ecx, r10d
|
||||||
|
mov rax, rdi
|
||||||
|
mul rbp
|
||||||
|
movq xmm2, rdx
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
add r8, rdx
|
||||||
|
mov QWORD PTR [r14], r8
|
||||||
|
xor r8, rdi
|
||||||
|
mov edi, r8d
|
||||||
|
and edi, 131056
|
||||||
|
movq xmm0, rax
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r14+8], r11
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
|
||||||
|
/* Second shuffle (r9d = offset^16, ecx = ^32, r10d = ^48): chunk^16 XOR
   product + xmm4 -> ^32, chunk^32 + round key -> ^48, chunk^48 + xmm5 ->
   ^16. History rotates xmm5 <- xmm4 <- AES result; xmm6 preloads the next
   chunk and r11 is XORed with the saved second qword (r12). */
mov r9d, r10d
|
||||||
|
xor r9d, 48
|
||||||
|
xor r10d, 16
|
||||||
|
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||||
|
movdqa xmm4, xmm6
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
movdqu xmm6, [rdi+rbx]
|
||||||
|
mov r10d, edi
|
||||||
|
xor r11, r12
|
||||||
|
dec rsi
|
||||||
|
jne $main_loop_ultralitev2_ivybridge
|
||||||
|
|
||||||
|
/*
 * Epilogue of the single-hash Ivy Bridge loop: restore the MXCSR value
 * saved by the prologue, reload rbx and xmm6-xmm8, release the frame, pop
 * the pushed registers and jump over the out-of-line fix-up code to the
 * end label.
 */
ldmxcsr DWORD PTR [rsp]
|
||||||
|
/* rsp + 160 = 80 (frame) + 56 (seven pushes) + 24: the slot above the
   entry rsp where the prologue stored rbx. */
mov rbx, QWORD PTR [rsp+160]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||||
|
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||||
|
add rsp, 80
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
|
||||||
|
|
||||||
|
/*
 * Out-of-line recomputation of the square-root result, entered from the
 * main loop when the low 19 bits of the raw sqrtsd bits (rdx) are zero.
 * Inputs:  rdx = raw sqrt bits, r9 = sqrt input (set in the main loop).
 * Output:  xmm3 = recomputed result. Clobbers rax, rcx and rdx; r13 is
 *          used as a temporary and reset to 0x80000001 before returning.
 * NOTE(review): presumably mirrors the integer sqrt fix-up of the C
 * implementation - confirm against it.
 */
$sqrt_fixup_ultralitev2_ivybridge:
|
||||||
|
dec rdx
|
||||||
|
/* r13 = -1022 << 32 (mov to r13d zero-extends, the shift moves the value
   into the upper dword). */
mov r13d, -1022
|
||||||
|
shl r13, 32
|
||||||
|
/* rdx = (raw - 1) >> 19 is the candidate result, rax = (raw - 1) >> 20. */
mov rax, rdx
|
||||||
|
shr rdx, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
add rax, r13
|
||||||
|
/* Subtracting NOT r13 equals adding r13 + 1, so
   rcx = (rdx - rax_old + r13 + 1) before the multiply. */
not r13
|
||||||
|
sub rcx, r13
|
||||||
|
/* Restore the divisor constant the main loop expects in r13d. */
mov r13d, -2147483647
|
||||||
|
/* Comparing the product with the sqrt input (r9) produces a borrow that
   adc adds to the candidate. */
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdx, 0
|
||||||
|
movq xmm3, rdx
|
||||||
|
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
|
||||||
|
|
||||||
|
$cnv2_main_loop_ultralitev2_ivybridge_endp:
|
183
src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc
Normal file
183
src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc
Normal file
|
@ -0,0 +1,183 @@
|
||||||
|
/* cn-ultralite v2 main loop, Ryzen variant: prologue and state setup.
 * rcx is the only argument read here; it is copied to r9 and used as the
 * base for every field load below.
 * rbx/rbp/rsi are stored in the three slots above the return address
 * ([rsp+16], [rsp+24], [rsp+32]); rdi and r12-r15 are pushed; 64 bytes of
 * locals hold the MXCSR copies and the saved xmm6-xmm8.
 * NOTE(review): this matches the Win64 convention (rcx argument, caller
 * home space, xmm6+ callee-saved) - confirm against the callers.
 */
mov QWORD PTR [rsp+16], rbx
|
||||||
|
mov QWORD PTR [rsp+24], rbp
|
||||||
|
mov QWORD PTR [rsp+32], rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
/* Save the current MXCSR at [rsp], then load 24448 (0x5F80): all SSE
 * exception masks set, rounding-control field = 10b (round toward +inf). */
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
/* Initial state, each value is the XOR of two 8-byte fields of *rcx:
 *   xmm3 = ([48]^[16], [56]^[24])     xmm4 = ([80]^[64], [88]^[72])
 *   r8   = [32]^[0]                   r11  = [40]^[8]
 * rbx = [rcx+224] is the base of every memory access in the loop
 * (presumably the scratchpad - confirm). rdi = [104], r15 = [96]. */
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
/* ebp = loop counter: 65536 iterations. */
mov ebp, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movq xmm3, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movq xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
mov rdi, QWORD PTR [r9+104]
|
||||||
|
/* r10 = first offset into the buffer: r8 masked with 131056 (0x1FFF0),
 * i.e. a 16-byte aligned offset below 128 KiB. */
and r10d, 131056
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm6
|
||||||
|
movq xmm4, rax
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm8
|
||||||
|
/* xmm8 = 0 (used in the loop to clear xmm1). */
xorps xmm8, xmm8
|
||||||
|
/* xmm7 = 1023 << 52 (0x3FF0000000000000). Only the low 12 bits of rax
 * survive the shift, so the stale upper bits of rax do not matter. */
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm7, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/* One pass of the main loop; executed ebp times (dec ebp / jne at the end).
 * Register roles on entry, as established by the code before the loop:
 *   rbx = buffer base, r10 = current 16-byte aligned offset,
 *   r8:r11 = 128-bit value used as the AES round key and as accumulator,
 *   xmm3 / xmm4 = values carried over from the previous two passes,
 *   rdi = previous square-root result, r15 = previous division result.
 */
$main_loop_ultralitev2_ryzen:
|
||||||
|
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||||
|
movq xmm0, r11
|
||||||
|
movq xmm6, r8
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
/* rdx keeps the address of the block just loaded; r9 = 2*rdi feeds the
 * divisor below; rdi << 32 is mixed into rsi below. */
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
lea r9, QWORD PTR [rdi+rdi]
|
||||||
|
shl rdi, 32
|
||||||
|
|
||||||
|
/* First shuffle: the three sibling 16-byte chunks at offset^16, ^32, ^48
 * are each incremented (by xmm3, xmm6, xmm4) and written back rotated. */
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
/* xmm5 = one AES encryption round of the loaded block, round key xmm6. */
aesenc xmm5, xmm6
|
||||||
|
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
paddq xmm0, xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
/* rsi = r15 ^ (old rdi << 32); r14 = low qword of the AES result.
 * The original block is overwritten with (AES result ^ xmm3), and the
 * next offset r10 = r14 & 0x1FFF0 selects the second block (r12 = its
 * address, r13 = its high qword, rsi ^= its low qword). */
movaps xmm1, xmm8
|
||||||
|
mov rsi, r15
|
||||||
|
xor rsi, rdi
|
||||||
|
movq r14, xmm5
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
mov r10, r14
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rsi, QWORD PTR [r10+rbx]
|
||||||
|
lea r12, QWORD PTR [r10+rbx]
|
||||||
|
mov r13, QWORD PTR [r10+rbx+8]
|
||||||
|
|
||||||
|
/* Integer division: divisor r9 = (2*old_rdi + r14) truncated to 32 bits
 * with bits 31 and 0 forced on (or 0x80000001); dividend = high qword of
 * the AES result, zero-extended (rdx = 0). */
add r9d, r14d
|
||||||
|
or r9d, -2147483647
|
||||||
|
xor edx, edx
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movq rax, xmm0
|
||||||
|
|
||||||
|
div r9
|
||||||
|
/* r15 = quotient (low 32 bits) | remainder (low 32 bits) << 32. */
movq xmm0, rax
|
||||||
|
movq xmm1, rdx
|
||||||
|
punpckldq xmm0, xmm1
|
||||||
|
movq r15, xmm0
|
||||||
|
/* Square root: input = r15 + low qword of AES result (copy kept in xmm2
 * for the fixup path); (input >> 12) + xmm7 is reinterpreted as a double
 * and passed to sqrtsd. rdi receives the raw result bits. */
paddq xmm0, xmm5
|
||||||
|
movdqa xmm2, xmm0
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdi, xmm1
|
||||||
|
/* If the low 19 bits (524287 = 0x7FFFF) are all zero, take the
 * out-of-line fixup, which leaves its own value in rdi and jumps back. */
test rdi, 524287
|
||||||
|
je $sqrt_fixup_ultralitev2_ryzen
|
||||||
|
shr rdi, 19
|
||||||
|
|
||||||
|
$sqrt_fixup_ultralitev2_ryzen_ret:
|
||||||
|
/* 64x64 -> 128-bit multiply: rdx:rax = rsi * r14; xmm0 = (rdx, rax). */
mov rax, rsi
|
||||||
|
mul r14
|
||||||
|
movq xmm1, rax
|
||||||
|
movq xmm0, rdx
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
/* Second shuffle around the second block: the chunk at offset^32 is
 * XORed into the product halves (rdx, rax), the chunk at offset^16 is
 * XORed with the product, then the three chunks are incremented and
 * written back rotated, as in the first shuffle. */
mov r9d, r10d
|
||||||
|
mov ecx, r10d
|
||||||
|
xor r9d, 16
|
||||||
|
xor ecx, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
pxor xmm2, xmm0
|
||||||
|
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
/* Accumulate the product into r8:r11, store it over the second block,
 * XOR in the block's previous contents (rsi, r13), derive the next
 * offset from r8, and age xmm3 -> xmm4, xmm5 -> xmm3. */
movdqa xmm4, xmm3
|
||||||
|
add r8, rdx
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r12], r8
|
||||||
|
xor r8, rsi
|
||||||
|
mov QWORD PTR [r12+8], r11
|
||||||
|
mov r10, r8
|
||||||
|
xor r11, r13
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa xmm3, xmm5
|
||||||
|
dec ebp
|
||||||
|
jne $main_loop_ultralitev2_ryzen
|
||||||
|
|
||||||
|
/* Epilogue: restore the MXCSR value saved at [rsp], the saved xmm6-xmm8,
 * and the integer registers. r11 = rsp+64 is the stack pointer from before
 * `sub rsp, 64`; the five pushes occupy 40 bytes above it, so [r11+56],
 * [r11+64] and [r11+72] are the slots written at entry as [rsp+16],
 * [rsp+24] and [rsp+32]. [r11-48] is the same address as [rsp+16].
 * The final jmp skips the out-of-line fixup code and lands on the end
 * label of this file.
 */
ldmxcsr DWORD PTR [rsp]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+64]
|
||||||
|
mov rbx, QWORD PTR [r11+56]
|
||||||
|
mov rbp, QWORD PTR [r11+64]
|
||||||
|
mov rsi, QWORD PTR [r11+72]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-48]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
|
||||||
|
|
||||||
|
/* Out-of-line path of the square-root step, entered when the low 19 bits
 * of the raw sqrtsd result (rdi) are all zero.
 * In:  rdi = raw result bits, xmm2 (low qword) = square-root input.
 * Out: rdi = corrected result; jumps back to the _ret label.
 * Clobbers rax, rcx, rdx, r9.
 * Steps: rdi -= 1; r = rdi >> 19; s = (rdi before the shift) >> 20;
 * K = 0xFFFFFC0200000000 (mov edx zero-extends -1022, then << 32);
 * rcx = (r - s + K + 1) * (s + K) - input. The subtraction sets the carry
 * flag when it borrows, and adc adds that carry to r.
 */
$sqrt_fixup_ultralitev2_ryzen:
|
||||||
|
movq r9, xmm2
|
||||||
|
dec rdi
|
||||||
|
mov edx, -1022
|
||||||
|
shl rdx, 32
|
||||||
|
mov rax, rdi
|
||||||
|
shr rdi, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdi
|
||||||
|
sub rcx, rax
|
||||||
|
lea rcx, [rcx+rdx+1]
|
||||||
|
add rax, rdx
|
||||||
|
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdi, 0
|
||||||
|
jmp $sqrt_fixup_ultralitev2_ryzen_ret
|
||||||
|
|
||||||
|
$cnv2_main_loop_ultralitev2_ryzen_endp:
|
271
src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc
Normal file
271
src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,271 @@
|
||||||
|
/* cn-ultralite v2 main loop, software-AES variant: prologue and state setup.
 * rcx is the only argument read here. It is stored in the slot just above
 * the return address so the loop can reload it (after the eight pushes and
 * `sub rsp, 152` that slot is [rsp+224]), and copied to r10 for the field
 * loads below.
 * NOTE(review): rcx argument, caller home space and xmm6+ preservation
 * match the Win64 convention - confirm against the callers.
 */
    mov QWORD PTR [rsp+8], rcx
    push rbx
    push rbp
    push rsi
    push rdi
    push r12
    push r13
    push r14
    push r15
    sub rsp, 152

/* Save the current MXCSR at [rsp+4], then load 24448 (0x5F80): all SSE
 * exception masks set, rounding-control field = 10b (round toward +inf). */
    stmxcsr DWORD PTR [rsp+4]
    mov DWORD PTR [rsp], 24448
    ldmxcsr DWORD PTR [rsp]

/* Initial state, each value is the XOR of two 8-byte fields of *rcx:
 *   xmm4 = ([48]^[16], [56]^[24])     xmm5 = ([80]^[64], [88]^[72])
 *   r8   = [32]^[0]                   r9   = [40]^[8]
 * r11 = [rcx+224] is the base of every buffer access in the loop. */
    mov rax, QWORD PTR [rcx+48]
    mov r10, rcx
    xor rax, QWORD PTR [rcx+16]
    mov r8, QWORD PTR [rcx+32]
    xor r8, QWORD PTR [rcx]
    mov r9, QWORD PTR [rcx+40]
    xor r9, QWORD PTR [rcx+8]
    movq xmm4, rax
    mov rdx, QWORD PTR [rcx+56]
    xor rdx, QWORD PTR [rcx+24]
    mov r11, QWORD PTR [rcx+224]
    mov rcx, QWORD PTR [rcx+88]
    xor rcx, QWORD PTR [r10+72]
    mov rax, QWORD PTR [r10+80]
    movq xmm0, rdx
    xor rax, QWORD PTR [r10+64]

/* Preserve xmm6-xmm13; the epilogue reloads them from the same slots. */
    movaps XMMWORD PTR [rsp+16], xmm6
    movaps XMMWORD PTR [rsp+32], xmm7
    movaps XMMWORD PTR [rsp+48], xmm8
    movaps XMMWORD PTR [rsp+64], xmm9
    movaps XMMWORD PTR [rsp+80], xmm10
    movaps XMMWORD PTR [rsp+96], xmm11
    movaps XMMWORD PTR [rsp+112], xmm12
    movaps XMMWORD PTR [rsp+128], xmm13

    movq xmm5, rax

/* xmm8 = 1023 << 52 (0x3FF0000000000000). Only the low 12 bits of rax
 * survive the shift, so the stale upper bits of rax do not matter. */
    mov ax, 1023
    shl rax, 52
    movq xmm8, rax

/* rax = first buffer offset: r8 masked with 131056 (0x1FFF0).
 * The offset and r9 are spilled to [rsp+248] / [rsp+240] because the loop
 * reuses rax and r9 as scratch. xmm9 = 0, xmm10 = [r10+96],
 * xmm12 = buffer base copy, xmm13 = [r10+104]. */
    mov rax, r8
    punpcklqdq xmm4, xmm0
    and eax, 131056
    movq xmm10, QWORD PTR [r10+96]
    movq xmm0, rcx
    mov rcx, QWORD PTR [r10+104]
    xorps xmm9, xmm9
    mov QWORD PTR [rsp+248], rax
    movq xmm12, r11
    mov QWORD PTR [rsp+240], r9
    punpcklqdq xmm5, xmm0
    movq xmm13, rcx
/* Loop counter: 65536 passes, the same count the hardware-AES variants of
 * this algorithm use (Ryzen: ebp, double-hash: r14d). The previous value,
 * 262144, made this path run four times as many rounds over the same
 * 0x1FFF0-masked buffer and therefore produce a different result. */
    mov r12d, 65536
|
||||||
|
|
||||||
|
#ifdef __APPLE__
|
||||||
|
ALIGN 16
|
||||||
|
#else
|
||||||
|
ALIGN 64
|
||||||
|
#endif
|
||||||
|
/* One pass of the software-AES main loop; the pass counter lives in r12d
 * between passes and is parked in xmm11 while r12 is used as a pointer.
 * State on entry: r8:r9 = 128-bit key/accumulator, rax = current buffer
 * offset (also at [rsp+248]), r11 = buffer base (copy in xmm12),
 * r10 = the original argument pointer, xmm4/xmm5 = values carried over
 * from the previous two passes, xmm10 = previous division result,
 * xmm13 = previous square-root result.
 */
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
|
||||||
|
movd xmm11, r12d
|
||||||
|
/* r12 = pointer loaded from [r10+272]; it is indexed below as four
 * consecutive tables of 256 dwords (presumably AES T-tables - confirm). */
mov r12, QWORD PTR [r10+272]
|
||||||
|
/* r13 = address of the current 16-byte block; its four dwords are loaded
 * into esi, r10d, r14d and ebp and consumed one byte at a time. */
lea r13, QWORD PTR [rax+r11]
|
||||||
|
mov esi, DWORD PTR [r13]
|
||||||
|
movq xmm0, r9
|
||||||
|
mov r10d, DWORD PTR [r13+4]
|
||||||
|
movq xmm7, r8
|
||||||
|
mov ebp, DWORD PTR [r13+12]
|
||||||
|
mov r14d, DWORD PTR [r13+8]
|
||||||
|
mov rdx, QWORD PTR [rsp+248]
|
||||||
|
/* Byte 0 of each dword -> first table. */
movzx ecx, sil
|
||||||
|
shr esi, 8
|
||||||
|
/* xmm7 = (r8, r9): XORed into the lookup result below. */
punpcklqdq xmm7, xmm0
|
||||||
|
mov r15d, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
mov edi, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, r14b
|
||||||
|
shr r14d, 8
|
||||||
|
mov ebx, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, bpl
|
||||||
|
shr ebp, 8
|
||||||
|
mov r9d, DWORD PTR [r12+rcx*4]
|
||||||
|
/* Byte 1 of each dword -> second table (+1024 bytes). */
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
movzx ecx, r14b
|
||||||
|
shr r14d, 8
|
||||||
|
mov eax, r14d
|
||||||
|
shr eax, 8
|
||||||
|
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
add eax, 256
|
||||||
|
movzx ecx, bpl
|
||||||
|
shr ebp, 8
|
||||||
|
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
movzx ecx, sil
|
||||||
|
shr esi, 8
|
||||||
|
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
/* r12 now points at the third table; indices biased by +256 address the
 * fourth one. Bytes 2 and 3 of each dword are looked up there. */
add r12, 2048
|
||||||
|
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
add r10d, 256
|
||||||
|
mov r11d, DWORD PTR [r12+rax*4]
|
||||||
|
xor r11d, DWORD PTR [r12+rcx*4]
|
||||||
|
xor r11d, r9d
|
||||||
|
movzx ecx, sil
|
||||||
|
mov r10d, DWORD PTR [r12+r10*4]
|
||||||
|
shr esi, 8
|
||||||
|
add esi, 256
|
||||||
|
xor r10d, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, bpl
|
||||||
|
xor r10d, ebx
|
||||||
|
shr ebp, 8
|
||||||
|
movd xmm1, r11d
|
||||||
|
add ebp, 256
|
||||||
|
/* r11 was used as scratch above; restore the buffer base from xmm12. */
movq r11, xmm12
|
||||||
|
mov r9d, DWORD PTR [r12+rcx*4]
|
||||||
|
xor r9d, DWORD PTR [r12+rsi*4]
|
||||||
|
mov eax, DWORD PTR [r12+rbp*4]
|
||||||
|
xor r9d, edi
|
||||||
|
movzx ecx, r14b
|
||||||
|
movd xmm0, r10d
|
||||||
|
movd xmm2, r9d
|
||||||
|
xor eax, DWORD PTR [r12+rcx*4]
|
||||||
|
mov rcx, rdx
|
||||||
|
xor eax, r15d
|
||||||
|
punpckldq xmm2, xmm1
|
||||||
|
xor rcx, 16
|
||||||
|
/* xmm6 = the four lookup words assembled into one 128-bit value,
 * then XORed with xmm7 (r8:r9). */
movd xmm6, eax
|
||||||
|
mov rax, rdx
|
||||||
|
punpckldq xmm6, xmm0
|
||||||
|
xor rax, 32
|
||||||
|
punpckldq xmm6, xmm2
|
||||||
|
xor rdx, 48
|
||||||
|
/* First shuffle: the sibling chunks at offset^16, ^32, ^48 are incremented
 * (by xmm4, xmm7, xmm5) and written back rotated. */
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||||
|
pxor xmm6, xmm7
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||||
|
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||||
|
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||||
|
movq rcx, xmm13
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||||
|
/* rdi = low qword of xmm6; r10 = offset of the second block. */
movq rdi, xmm6
|
||||||
|
mov r10, rdi
|
||||||
|
and r10d, 131056
|
||||||
|
xor edx, edx
|
||||||
|
/* rbx = xmm10 ^ (xmm13 << 32), later XORed with the second block. */
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
movq rbx, xmm10
|
||||||
|
xor rbx, rax
|
||||||
|
/* Divisor r9 = (2*xmm13 + rdi) truncated to 32 bits, OR 0x80000001. */
lea r9, QWORD PTR [rcx+rcx]
|
||||||
|
add r9d, edi
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
pxor xmm0, xmm4
|
||||||
|
mov ecx, -2147483647
|
||||||
|
/* The first block is overwritten with xmm6 ^ xmm4. */
movdqu XMMWORD PTR [r13], xmm0
|
||||||
|
or r9, rcx
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
movaps xmm1, xmm9
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movq rax, xmm0
|
||||||
|
xor rbx, QWORD PTR [r10+r11]
|
||||||
|
lea r14, QWORD PTR [r10+r11]
|
||||||
|
mov rbp, QWORD PTR [r14+8]
|
||||||
|
/* rdx:rax (rdx = 0, rax = high qword of xmm6) / r9. */
div r9
|
||||||
|
/* xmm10 = quotient (low 32 bits) + (remainder << 32). */
shl rdx, 32
|
||||||
|
mov eax, eax
|
||||||
|
add rdx, rax
|
||||||
|
/* r9 = square-root input (kept for the fixup path); (r9 >> 12) + xmm8 is
 * reinterpreted as a double and passed to sqrtsd. */
lea r9, QWORD PTR [rdx+rdi]
|
||||||
|
movq xmm10, rdx
|
||||||
|
mov rax, r9
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdx, xmm1
|
||||||
|
/* Low 19 result bits all zero -> out-of-line fixup, which sets xmm1. */
test rdx, 524287
|
||||||
|
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
|
||||||
|
psrlq xmm1, 19
|
||||||
|
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
|
||||||
|
|
||||||
|
/* xmm13 = new square-root result. Then the second shuffle around the
 * second block (offsets r10^16, ^32, ^48) combined with the 64x64
 * multiply rdx:rax = rbx * rdi. */
mov r9, r10
|
||||||
|
movdqa xmm13, xmm1
|
||||||
|
xor r9, 16
|
||||||
|
mov rcx, r10
|
||||||
|
xor rcx, 32
|
||||||
|
xor r10, 48
|
||||||
|
mov rax, rbx
|
||||||
|
mul rdi
|
||||||
|
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm3, rdx
|
||||||
|
xor rax, QWORD PTR [r11+rcx+8]
|
||||||
|
xor rdx, QWORD PTR [rcx+r11]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
add r8, rdx
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||||
|
pxor xmm2, xmm3
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
/* r9 (high half of the accumulator) was spilled at [rsp+240]. */
mov r9, QWORD PTR [rsp+240]
|
||||||
|
movdqa xmm4, xmm6
|
||||||
|
add r9, rax
|
||||||
|
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||||
|
/* Reload the argument pointer stored at entry (entry [rsp+8] is
 * [rsp+224] here) and the pass counter parked in xmm11. */
mov r10, QWORD PTR [rsp+224]
|
||||||
|
movd r12d, xmm11
|
||||||
|
/* Store the accumulator over the second block, XOR in the block's
 * previous contents (rbx, rbp), derive and spill the next offset. */
mov QWORD PTR [r14], r8
|
||||||
|
xor r8, rbx
|
||||||
|
mov rax, r8
|
||||||
|
mov QWORD PTR [r14+8], r9
|
||||||
|
and eax, 131056
|
||||||
|
xor r9, rbp
|
||||||
|
mov QWORD PTR [rsp+240], r9
|
||||||
|
mov QWORD PTR [rsp+248], rax
|
||||||
|
sub r12d, 1
|
||||||
|
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
/* Epilogue: restore the MXCSR value saved at [rsp+4], reload xmm6-xmm13
 * from the slots the prologue wrote, release the 152-byte frame and pop
 * the eight integer registers in reverse push order. The final jmp skips
 * the out-of-line fixup code and lands on the end label of this file.
 */
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||||
|
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||||
|
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||||
|
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||||
|
|
||||||
|
add rsp, 152
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
||||||
|
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
|
||||||
|
|
||||||
|
/* Out-of-line path of the square-root step, entered when the low 19 bits
 * of the raw sqrtsd result (rdx) are all zero.
 * In:  rdx = raw result bits, r9 = square-root input.
 * Out: xmm1 = corrected result; jumps back to the _ret label.
 * Clobbers rax, rcx, rdx, r15.
 * Steps: rdx -= 1; r = rdx >> 19; s = (rdx before the shift) >> 20;
 * K = 0xFFFFFC0200000000 (mov r15d zero-extends -1022, then << 32);
 * rcx = (r - s + K + 1) * (s + K) - input. The subtraction sets the carry
 * flag when it borrows, and adc adds that carry to r.
 */
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
|
||||||
|
dec rdx
|
||||||
|
mov r15d, -1022
|
||||||
|
shl r15, 32
|
||||||
|
mov rax, rdx
|
||||||
|
shr rdx, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
sub rcx, rax
|
||||||
|
lea rcx, [rcx+r15+1]
|
||||||
|
add rax, r15
|
||||||
|
imul rcx, rax
|
||||||
|
sub rcx, r9
|
||||||
|
adc rdx, 0
|
||||||
|
movq xmm1, rdx
|
||||||
|
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
|
||||||
|
|
||||||
|
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:
|
|
@ -11,6 +11,10 @@ PUBLIC cn_fastv2_mainloop_ryzen_asm
|
||||||
PUBLIC cn_fastv2_mainloop_bulldozer_asm
|
PUBLIC cn_fastv2_mainloop_bulldozer_asm
|
||||||
PUBLIC cn_fastv2_double_mainloop_sandybridge_asm
|
PUBLIC cn_fastv2_double_mainloop_sandybridge_asm
|
||||||
PUBLIC cn_liteupx_mainloop_sandybridge_asm
|
PUBLIC cn_liteupx_mainloop_sandybridge_asm
|
||||||
|
PUBLIC cn_ultralitev2_mainloop_ivybridge_asm
|
||||||
|
PUBLIC cn_ultralitev2_mainloop_ryzen_asm
|
||||||
|
PUBLIC cn_ultralitev2_mainloop_bulldozer_asm
|
||||||
|
PUBLIC cn_ultralitev2_double_mainloop_sandybridge_asm
|
||||||
|
|
||||||
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
|
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
|
||||||
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
|
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
|
||||||
|
@ -18,6 +22,7 @@ PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
|
||||||
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
|
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
|
||||||
PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm
|
PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm
|
||||||
PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm
|
PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm
|
||||||
|
PUBLIC cn_ultralitev2_mainloop_soft_aes_sandybridge_asm
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv1_mainloop_sandybridge_asm PROC
|
cnv1_mainloop_sandybridge_asm PROC
|
||||||
|
@ -91,6 +96,30 @@ cn_liteupx_mainloop_sandybridge_asm PROC
|
||||||
ret 0
|
ret 0
|
||||||
cn_liteupx_mainloop_sandybridge_asm ENDP
|
cn_liteupx_mainloop_sandybridge_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; Exported procedure. INCLUDE pastes cn_ultralitev2_main_loop_ivybridge.inc
; here as the procedure body; `ret 0` follows the included code.
cn_ultralitev2_mainloop_ivybridge_asm PROC
|
||||||
|
INCLUDE cn_ultralitev2_main_loop_ivybridge.inc
|
||||||
|
ret 0
|
||||||
|
cn_ultralitev2_mainloop_ivybridge_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; Exported procedure. INCLUDE pastes cn_ultralitev2_main_loop_ryzen.inc
; here as the procedure body; `ret 0` follows the included code.
cn_ultralitev2_mainloop_ryzen_asm PROC
|
||||||
|
INCLUDE cn_ultralitev2_main_loop_ryzen.inc
|
||||||
|
ret 0
|
||||||
|
cn_ultralitev2_mainloop_ryzen_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; Exported procedure. INCLUDE pastes cn_ultralitev2_main_loop_bulldozer.inc
; here as the procedure body; `ret 0` follows the included code.
cn_ultralitev2_mainloop_bulldozer_asm PROC
|
||||||
|
INCLUDE cn_ultralitev2_main_loop_bulldozer.inc
|
||||||
|
ret 0
|
||||||
|
cn_ultralitev2_mainloop_bulldozer_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; Exported procedure (two-hash variant). INCLUDE pastes
; cn_ultralitev2_double_main_loop_sandybridge.inc here as the procedure
; body; `ret 0` follows the included code.
cn_ultralitev2_double_mainloop_sandybridge_asm PROC
|
||||||
|
INCLUDE cn_ultralitev2_double_main_loop_sandybridge.inc
|
||||||
|
ret 0
|
||||||
|
cn_ultralitev2_double_mainloop_sandybridge_asm ENDP
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
cnv1_mainloop_soft_aes_sandybridge_asm PROC
|
cnv1_mainloop_soft_aes_sandybridge_asm PROC
|
||||||
INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc
|
INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc
|
||||||
|
@ -127,5 +156,11 @@ cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC
|
||||||
ret 0
|
ret 0
|
||||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP
|
cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; Exported procedure (software-AES variant). INCLUDE pastes
; cn_ultralitev2_mainloop_soft_aes_sandybridge.inc here as the procedure
; body; `ret 0` follows the included code.
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PROC
|
||||||
|
INCLUDE cn_ultralitev2_mainloop_soft_aes_sandybridge.inc
|
||||||
|
ret 0
|
||||||
|
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||||
|
|
||||||
_TEXT_CN_MAINLOOP ENDS
|
_TEXT_CN_MAINLOOP ENDS
|
||||||
END
|
END
|
|
@ -15,6 +15,10 @@
|
||||||
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
||||||
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
|
||||||
|
|
||||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
@ -22,6 +26,7 @@
|
||||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
||||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm)
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
FN_PREFIX(cnv1_mainloop_sandybridge_asm):
|
FN_PREFIX(cnv1_mainloop_sandybridge_asm):
|
||||||
|
@ -83,6 +88,26 @@ FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||||
#include "../cn_liteupx_mainloop_sandybridge.inc"
|
#include "../cn_liteupx_mainloop_sandybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
/* Entry point: the body is the preprocessor-included .inc file below;
 * `ret 0` follows the included code. */
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm):
|
||||||
|
#include "../cn_ultralitev2_main_loop_ivybridge.inc"
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
/* Entry point: the body is the preprocessor-included .inc file below;
 * `ret 0` follows the included code. */
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm):
|
||||||
|
#include "../cn_ultralitev2_main_loop_ryzen.inc"
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
/* Entry point: the body is the preprocessor-included .inc file below;
 * `ret 0` follows the included code. */
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm):
|
||||||
|
#include "../cn_ultralitev2_main_loop_bulldozer.inc"
|
||||||
|
ret 0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
/* Entry point (two-hash variant): the body is the preprocessor-included
 * .inc file below; `ret 0` follows the included code. */
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm):
|
||||||
|
#include "../cn_ultralitev2_double_main_loop_sandybridge.inc"
|
||||||
|
ret 0
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm):
|
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm):
|
||||||
#include "../cnv1_mainloop_soft_aes_sandybridge.inc"
|
#include "../cnv1_mainloop_soft_aes_sandybridge.inc"
|
||||||
|
@ -108,8 +133,12 @@ FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm):
|
||||||
#include "../cn_fastv2_mainloop_soft_aes_sandybridge.inc"
|
#include "../cn_fastv2_mainloop_soft_aes_sandybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
|
|
||||||
ALIGN 64
|
ALIGN 64
|
||||||
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||||
#include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
#include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||||
ret 0
|
ret 0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
/* Entry point (software-AES variant): the body is the
 * preprocessor-included .inc file below; `ret 0` follows it. */
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
|
||||||
|
#include "../cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
|
||||||
|
ret 0
|
|
@ -0,0 +1,410 @@
|
||||||
|
; cn-ultralite v2 double-hash main loop: prologue and state setup.
; rcx and rdx are the two arguments read here (copied to r8 and r9); the
; same field offsets are loaded from each, giving two independent states
; that the loop advances together.
; rax keeps the entry stack pointer so xmm6-xmm8 can be stored relative to
; it; after the eight pushes and `sub rsp, 184` the entry stack pointer
; equals rsp+248.
mov rax, rsp
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 184
|
||||||
|
|
||||||
|
; Save the current MXCSR at [rsp+272], then load 24448 (0x5F80): all SSE
; exception masks set, rounding-control field = 10b (round toward +inf).
stmxcsr DWORD PTR [rsp+272]
|
||||||
|
mov DWORD PTR [rsp+276], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+276]
|
||||||
|
|
||||||
|
; First state (from rcx): r13 = buffer base [224], r10 = [32]^[0],
; r11 = [40]^[8]. Second state (from rdx/r9): rsi = buffer base [224],
; rdi = [32]^[0], rbp = [40]^[8]. r14d = loop counter (65536 passes).
mov r13, QWORD PTR [rcx+224]
|
||||||
|
mov r9, rdx
|
||||||
|
mov r10, QWORD PTR [rcx+32]
|
||||||
|
mov r8, rcx
|
||||||
|
xor r10, QWORD PTR [rcx]
|
||||||
|
mov r14d, 65536
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rsi, QWORD PTR [rdx+224]
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
mov rdi, QWORD PTR [r9+32]
|
||||||
|
xor rdi, QWORD PTR [r9]
|
||||||
|
mov rbp, QWORD PTR [r9+40]
|
||||||
|
xor rbp, QWORD PTR [r9+8]
|
||||||
|
movq xmm0, rdx
|
||||||
|
; Preserve xmm6-xmm15 ([rax-88] is the same address as [rsp+160]).
movaps XMMWORD PTR [rax-88], xmm6
|
||||||
|
movaps XMMWORD PTR [rax-104], xmm7
|
||||||
|
movaps XMMWORD PTR [rax-120], xmm8
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm9
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm13
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm14
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm15
|
||||||
|
; rdx = first offset into the first buffer (r10 & 0x1FFF0).
mov rdx, r10
|
||||||
|
movq xmm4, QWORD PTR [r8+96]
|
||||||
|
and edx, 131056
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
; xmm13 = 0 (used in the loop to clear conversion targets).
xorps xmm13, xmm13
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r8+72]
|
||||||
|
movq xmm5, QWORD PTR [r8+104]
|
||||||
|
movq xmm7, rax
|
||||||
|
|
||||||
|
; xmm14 = 1 << 52 in both qword lanes.
mov eax, 1
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm14, rax
|
||||||
|
punpcklqdq xmm14, xmm14
|
||||||
|
|
||||||
|
; xmm12 = 1023 << 52 (0x3FF0000000000000) in both qword lanes.
mov eax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm12, rax
|
||||||
|
punpcklqdq xmm12, xmm12
|
||||||
|
|
||||||
|
; Remaining 128-bit state, each half the XOR of two fields:
;   first state:  xmm7 = ([48]^[16], [56]^[24]), xmm3 = ([80]^[64], [88]^[72])
;   second state: xmm6 = ([48]^[16], [56]^[24]), xmm8 = ([80]^[64], [88]^[72])
; xmm4 = ([r8+96], [r9+96]) and xmm5 = ([r8+104], [r9+104]) pair the two
; states lane by lane.
mov rax, QWORD PTR [r8+80]
|
||||||
|
xor rax, QWORD PTR [r8+64]
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [r9+56]
|
||||||
|
xor rcx, QWORD PTR [r9+24]
|
||||||
|
movq xmm3, rax
|
||||||
|
mov rax, QWORD PTR [r9+48]
|
||||||
|
xor rax, QWORD PTR [r9+16]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
; The first buffer base is spilled to [rsp]; r13 is reused inside the loop.
mov QWORD PTR [rsp], r13
|
||||||
|
mov rcx, QWORD PTR [r9+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movq xmm6, rax
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
; r10/r11 of the first state are spilled to [rsp+256]/[rsp+264].
mov QWORD PTR [rsp+256], r10
|
||||||
|
mov rcx, rdi
|
||||||
|
mov QWORD PTR [rsp+264], r11
|
||||||
|
movq xmm8, rax
|
||||||
|
; rcx = first offset into the second buffer (rdi & 0x1FFF0).
and ecx, 131056
|
||||||
|
punpcklqdq xmm8, xmm0
|
||||||
|
movq xmm0, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movq xmm0, QWORD PTR [r9+104]
|
||||||
|
; Preload the first 16-byte block of each buffer: xmm11 (second state,
; address r8) and xmm15 (first state, address r9).
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
movdqu xmm11, XMMWORD PTR [r8]
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
lea r9, QWORD PTR [rdx+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r9]
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
; One pass of the double-hash main loop; executed r14d times.
; First state:  buffer base r13, offset edx, block address r9, block
;               preloaded in xmm15, key/accumulator r10:r11 (also spilled
;               at [rsp+256]/[rsp+264]), carried values xmm7 / xmm3.
; Second state: buffer base rsi, offset ecx, block address r8, block
;               preloaded in xmm11, key/accumulator rdi:rbp, carried
;               values xmm6 / xmm8.
; xmm4 / xmm5 hold the previous division / square-root results of both
; states, one per qword lane.
main_loop_double_ultralitev2_sandybridge:
|
||||||
|
movdqu xmm9, xmm15
|
||||||
|
mov eax, edx
|
||||||
|
mov ebx, edx
|
||||||
|
xor eax, 16
|
||||||
|
xor ebx, 32
|
||||||
|
xor edx, 48
|
||||||
|
|
||||||
|
; First state: xmm9 = one AES round of its block, round key (r10, r11).
movq xmm0, r11
|
||||||
|
movq xmm2, r10
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
aesenc xmm9, xmm2
|
||||||
|
|
||||||
|
; First state, first shuffle: chunks at offset^16, ^32, ^48 are
; incremented (by xmm7, xmm2, xmm3) and written back rotated.
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||||
|
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||||
|
|
||||||
|
; r11 = low qword of the AES result; edx = offset of the second block;
; the first block is overwritten with (AES result ^ xmm7).
movq r11, xmm9
|
||||||
|
mov edx, r11d
|
||||||
|
and edx, 131056
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
pxor xmm0, xmm7
|
||||||
|
movdqu XMMWORD PTR [r9], xmm0
|
||||||
|
|
||||||
|
lea rbx, QWORD PTR [rdx+r13]
|
||||||
|
mov r10, QWORD PTR [rdx+r13]
|
||||||
|
|
||||||
|
; Second state: xmm10 = one AES round of its block, round key (rdi, rbp).
movdqu xmm10, xmm11
|
||||||
|
movq xmm0, rbp
|
||||||
|
movq xmm11, rdi
|
||||||
|
punpcklqdq xmm11, xmm0
|
||||||
|
aesenc xmm10, xmm11
|
||||||
|
|
||||||
|
mov eax, ecx
|
||||||
|
mov r12d, ecx
|
||||||
|
xor eax, 16
|
||||||
|
xor r12d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
|
||||||
|
; Second state, first shuffle (increments xmm6, xmm11, xmm8).
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||||
|
|
||||||
|
movq rcx, xmm10
|
||||||
|
and ecx, 131056
|
||||||
|
|
||||||
|
movdqa xmm0, xmm10
|
||||||
|
pxor xmm0, xmm6
|
||||||
|
movdqu XMMWORD PTR [r8], xmm0
|
||||||
|
mov r12, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
mov r9, QWORD PTR [rbx+8]
|
||||||
|
|
||||||
|
xor edx, 16
|
||||||
|
mov r8d, edx
|
||||||
|
mov r15d, edx
|
||||||
|
|
||||||
|
; First state multiply: r10 = second block low qword ^ (low(xmm5) << 32)
; ^ low(xmm4); rdx:rax = r10 * r11. The product halves are XORed with the
; chunk at offset^32 for the accumulator, and the product itself is XORed
; into the chunk at offset^16 as part of the second shuffle.
movq rdx, xmm5
|
||||||
|
shl rdx, 32
|
||||||
|
movq rax, xmm4
|
||||||
|
xor rdx, rax
|
||||||
|
xor r10, rdx
|
||||||
|
mov rax, r10
|
||||||
|
mul r11
|
||||||
|
mov r11d, r8d
|
||||||
|
xor r11d, 48
|
||||||
|
movq xmm0, rdx
|
||||||
|
xor rdx, [r11+r13]
|
||||||
|
movq xmm1, rax
|
||||||
|
xor rax, [r11+r13+8]
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
xor r8d, 32
|
||||||
|
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
paddq xmm1, xmm2
|
||||||
|
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||||
|
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||||
|
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||||
|
paddq xmm0, xmm3
|
||||||
|
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||||
|
|
||||||
|
; First state: accumulate the product into the spilled r10:r11, store it
; over the second block, XOR in the block's previous contents, spill the
; new values, and compute/preload the next block (offset at [rsp+8],
; address r15, data xmm15).
mov r11, QWORD PTR [rsp+256]
|
||||||
|
add r11, rdx
|
||||||
|
mov rdx, QWORD PTR [rsp+264]
|
||||||
|
add rdx, rax
|
||||||
|
mov QWORD PTR [rbx], r11
|
||||||
|
xor r11, r10
|
||||||
|
mov QWORD PTR [rbx+8], rdx
|
||||||
|
xor rdx, r9
|
||||||
|
mov QWORD PTR [rsp+256], r11
|
||||||
|
and r11d, 131056
|
||||||
|
mov QWORD PTR [rsp+264], rdx
|
||||||
|
mov QWORD PTR [rsp+8], r11
|
||||||
|
lea r15, QWORD PTR [r11+r13]
|
||||||
|
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||||
|
; r13 now becomes the address of the second state's second block; the
; first buffer base is reloaded from [rsp] at the end of the pass.
lea r13, QWORD PTR [rsi+rcx]
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movaps xmm2, xmm13
|
||||||
|
movq r10, xmm0
|
||||||
|
psllq xmm5, 1
|
||||||
|
shl r10, 32
|
||||||
|
movdqa xmm0, xmm9
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movdqa xmm1, xmm10
|
||||||
|
movq r11, xmm0
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movq r8, xmm1
|
||||||
|
psrldq xmm4, 8
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
movq rax, xmm4
|
||||||
|
; r10 = multiplier of the second state: (high(xmm5) << 32) ^ high(xmm4)
; ^ low qword of its second block (r12).
xor r10, rax
|
||||||
|
movaps xmm1, xmm13
|
||||||
|
xor r10, r12
|
||||||
|
; Both divisions are done in one divpd. Numerators r11 / r8 are the high
; qwords of the two AES results, converted as (n + 1) >> 1. Divisors
; rdx / r9 are the 32-bit values (2*previous sqrt + AES low qword)
; truncated and ORed with 0x80000001.
lea rax, QWORD PTR [r11+1]
|
||||||
|
shr rax, 1
|
||||||
|
movdqa xmm3, xmm9
|
||||||
|
punpcklqdq xmm3, xmm10
|
||||||
|
paddq xmm5, xmm3
|
||||||
|
movq rdx, xmm5
|
||||||
|
psrldq xmm5, 8
|
||||||
|
cvtsi2sd xmm2, rax
|
||||||
|
or edx, -2147483647
|
||||||
|
lea rax, QWORD PTR [r8+1]
|
||||||
|
shr rax, 1
|
||||||
|
movq r9, xmm5
|
||||||
|
cvtsi2sd xmm0, rax
|
||||||
|
or r9d, -2147483647
|
||||||
|
cvtsi2sd xmm1, rdx
|
||||||
|
unpcklpd xmm2, xmm0
|
||||||
|
movaps xmm0, xmm13
|
||||||
|
cvtsi2sd xmm0, r9
|
||||||
|
unpcklpd xmm1, xmm0
|
||||||
|
divpd xmm2, xmm1
|
||||||
|
; Adding 1 << 52 to each lane's bit pattern bumps the exponent field by
; one before the truncating conversion to an integer quotient.
paddq xmm2, xmm14
|
||||||
|
cvttsd2si rax, xmm2
|
||||||
|
psrldq xmm2, 8
|
||||||
|
; First state: rbx = quotient, r11 = numerator - quotient * divisor;
; a negative remainder takes the out-of-line correction.
mov rbx, rax
|
||||||
|
imul rax, rdx
|
||||||
|
sub r11, rax
|
||||||
|
js div_fix_1_ultralitev2_sandybridge
|
||||||
|
div_fix_1_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
; Second state: rdx = quotient, r8 = remainder, same correction scheme.
cvttsd2si rdx, xmm2
|
||||||
|
mov rax, rdx
|
||||||
|
imul rax, r9
|
||||||
|
movd xmm2, r11d
|
||||||
|
movd xmm4, ebx
|
||||||
|
sub r8, rax
|
||||||
|
js div_fix_2_ultralitev2_sandybridge
|
||||||
|
div_fix_2_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
; xmm4 = per lane: quotient (low dword) | remainder (high dword).
; Square roots of both states in one sqrtpd: input xmm3 = xmm4 + AES low
; qwords; ((input >> 12) + xmm12) is reinterpreted as two doubles.
; xmm5 = both raw results >> 19; each lane whose low 19 raw bits are all
; zero is recomputed by its out-of-line fixup.
movd xmm1, r8d
|
||||||
|
movd xmm0, edx
|
||||||
|
punpckldq xmm2, xmm1
|
||||||
|
punpckldq xmm4, xmm0
|
||||||
|
punpckldq xmm4, xmm2
|
||||||
|
paddq xmm3, xmm4
|
||||||
|
movdqa xmm0, xmm3
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm12
|
||||||
|
sqrtpd xmm1, xmm0
|
||||||
|
movq r9, xmm1
|
||||||
|
movdqa xmm5, xmm1
|
||||||
|
psrlq xmm5, 19
|
||||||
|
test r9, 524287
|
||||||
|
je sqrt_fix_1_ultralitev2_sandybridge
|
||||||
|
sqrt_fix_1_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
movq r9, xmm10
|
||||||
|
psrldq xmm1, 8
|
||||||
|
movq r8, xmm1
|
||||||
|
test r8, 524287
|
||||||
|
je sqrt_fix_2_ultralitev2_sandybridge
|
||||||
|
sqrt_fix_2_ret_ultralitev2_sandybridge:
|
||||||
|
|
||||||
|
; Second state multiply (rdx:rax = r10 * low qword of xmm10) and second
; shuffle around its second block, mirroring the first state above.
mov r12d, ecx
|
||||||
|
mov r8d, ecx
|
||||||
|
xor r12d, 16
|
||||||
|
xor r8d, 32
|
||||||
|
xor ecx, 48
|
||||||
|
mov rax, r10
|
||||||
|
mul r9
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm3, rdx
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
|
||||||
|
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||||
|
xor rdx, [r8+rsi]
|
||||||
|
xor rax, [r8+rsi+8]
|
||||||
|
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||||
|
paddq xmm0, xmm6
|
||||||
|
paddq xmm1, xmm11
|
||||||
|
paddq xmm3, xmm8
|
||||||
|
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||||
|
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||||
|
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||||
|
|
||||||
|
; Second state: accumulate into rdi:rbp, store over the second block
; (address r13), XOR in its previous contents, compute and preload the
; next block (offset ecx, address r8, data xmm11).
add rdi, rdx
|
||||||
|
mov QWORD PTR [r13], rdi
|
||||||
|
xor rdi, r10
|
||||||
|
mov ecx, edi
|
||||||
|
and ecx, 131056
|
||||||
|
lea r8, QWORD PTR [rcx+rsi]
|
||||||
|
|
||||||
|
mov rdx, QWORD PTR [r13+8]
|
||||||
|
add rbp, rax
|
||||||
|
mov QWORD PTR [r13+8], rbp
|
||||||
|
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||||
|
xor rbp, rdx
|
||||||
|
; Restore the first state's registers from their spill slots and age the
; carried values (xmm7 -> xmm3, xmm9 -> xmm7, xmm6 -> xmm8, xmm10 -> xmm6).
mov r13, QWORD PTR [rsp]
|
||||||
|
movdqa xmm3, xmm7
|
||||||
|
mov rdx, QWORD PTR [rsp+8]
|
||||||
|
movdqa xmm8, xmm6
|
||||||
|
mov r10, QWORD PTR [rsp+256]
|
||||||
|
movdqa xmm7, xmm9
|
||||||
|
mov r11, QWORD PTR [rsp+264]
|
||||||
|
movdqa xmm6, xmm10
|
||||||
|
mov r9, r15
|
||||||
|
dec r14d
|
||||||
|
jne main_loop_double_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
; Epilogue: restore the MXCSR value saved at [rsp+272], reload xmm6-xmm15
; from the slots the prologue wrote (r11 = rsp+184 is the stack pointer
; from before `sub rsp, 184`, so [r11-24] is [rsp+160] and so on), then pop
; the eight integer registers in reverse push order. The final jmp skips
; the out-of-line fixup code and lands on the end label of this file.
ldmxcsr DWORD PTR [rsp+272]
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+184]
|
||||||
|
movaps xmm6, XMMWORD PTR [r11-24]
|
||||||
|
movaps xmm7, XMMWORD PTR [r11-40]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-56]
|
||||||
|
movaps xmm9, XMMWORD PTR [r11-72]
|
||||||
|
movaps xmm10, XMMWORD PTR [r11-88]
|
||||||
|
movaps xmm11, XMMWORD PTR [r11-104]
|
||||||
|
movaps xmm12, XMMWORD PTR [r11-120]
|
||||||
|
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||||
|
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
||||||
|
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
|
||||||
|
|
||||||
|
; Division correction for the first state, taken when the remainder in r11
; came out negative: lower the quotient (rbx) by one and add the divisor
; (rdx) back to the remainder.
div_fix_1_ultralitev2_sandybridge:
|
||||||
|
dec rbx
|
||||||
|
add r11, rdx
|
||||||
|
jmp div_fix_1_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
; Division correction for the second state, taken when the remainder in r8
; came out negative: lower the quotient (rdx) by one and add the divisor
; (r9) back to the remainder.
div_fix_2_ultralitev2_sandybridge:
|
||||||
|
dec rdx
|
||||||
|
add r8, r9
|
||||||
|
jmp div_fix_2_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
; Square-root fixup for the first state (low lane), entered when the low
; 19 bits of the raw sqrtpd result in r9 are all zero.
; In:  r9 = raw low-lane result bits, low qword of xmm3 = square-root
;      input, xmm5 = both lanes already shifted right by 19.
; Out: low lane of xmm5 replaced by the corrected value, high lane kept
;      (saved in xmm0 and merged back with punpcklqdq).
; Clobbers rax, rdx, r8, r9, r11, xmm0.
; Steps: r9 -= 1; r = r9 >> 19; s = (r9 before the shift) >> 20;
; K = 0xFFFFFC0200000000 (mov r11d zero-extends -1022, then << 32);
; rdx = (r - s + K + 1) * (s + K) - input. The subtraction sets the carry
; flag when it borrows, and adc adds that carry to r.
sqrt_fix_1_ultralitev2_sandybridge:
|
||||||
|
movq r8, xmm3
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
dec r9
|
||||||
|
mov r11d, -1022
|
||||||
|
shl r11, 32
|
||||||
|
mov rax, r9
|
||||||
|
shr r9, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rdx, r9
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+r11+1]
|
||||||
|
add rax, r11
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r8
|
||||||
|
adc r9, 0
|
||||||
|
movq xmm5, r9
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
; Square-root fixup for the second state (high lane), entered when the low
; 19 bits of the raw high-lane sqrtpd result in r8 are all zero.
; In:  r8 = raw high-lane result bits, high qword of xmm3 = square-root
;      input (xmm3 is shifted down here and not needed afterwards by the
;      loop before it is reloaded), xmm5 low lane = first state's result.
; Out: high lane of xmm5 replaced by the corrected value.
; Clobbers rax, rbx, rdx, r8, r11, xmm0, xmm3.
; Steps: r8 -= 1; r = r8 >> 19; s = (r8 before the shift) >> 20;
; K = 0xFFFFFC0200000000 (mov ebx zero-extends -1022, then << 32);
; rdx = (r - s + K + 1) * (s + K) - input. The subtraction sets the carry
; flag when it borrows, and adc adds that carry to r.
sqrt_fix_2_ultralitev2_sandybridge:
|
||||||
|
psrldq xmm3, 8
|
||||||
|
movq r11, xmm3
|
||||||
|
dec r8
|
||||||
|
mov ebx, -1022
|
||||||
|
shl rbx, 32
|
||||||
|
mov rax, r8
|
||||||
|
shr r8, 19
|
||||||
|
shr rax, 20
|
||||||
|
mov rdx, r8
|
||||||
|
sub rdx, rax
|
||||||
|
lea rdx, [rdx+rbx+1]
|
||||||
|
add rax, rbx
|
||||||
|
imul rdx, rax
|
||||||
|
sub rdx, r11
|
||||||
|
adc r8, 0
|
||||||
|
movq xmm0, r8
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:
|
180
src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
||||||
|
mov QWORD PTR [rsp+16], rbx
|
||||||
|
mov QWORD PTR [rsp+24], rbp
|
||||||
|
mov QWORD PTR [rsp+32], rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov ebp, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movd xmm3, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movd xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
mov rdi, QWORD PTR [r9+104]
|
||||||
|
and r10d, 131056
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm6
|
||||||
|
movd xmm4, rax
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm8
|
||||||
|
xorps xmm8, xmm8
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movd xmm7, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movd xmm0, rcx
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
|
||||||
|
ALIGN 16
|
||||||
|
cnv2_main_loop_ultralitev2_bulldozer:
|
||||||
|
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||||
|
movd xmm6, r8
|
||||||
|
pinsrq xmm6, r11, 1
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
lea r9, QWORD PTR [rdi+rdi]
|
||||||
|
shl rdi, 32
|
||||||
|
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
aesenc xmm5, xmm6
|
||||||
|
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
paddq xmm0, xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movaps xmm1, xmm8
|
||||||
|
mov rsi, r15
|
||||||
|
xor rsi, rdi
|
||||||
|
|
||||||
|
mov edi, 1023
|
||||||
|
shl rdi, 52
|
||||||
|
|
||||||
|
movd r14, xmm5
|
||||||
|
pextrq rax, xmm5, 1
|
||||||
|
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
mov r10, r14
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rsi, QWORD PTR [r10+rbx]
|
||||||
|
lea r12, QWORD PTR [r10+rbx]
|
||||||
|
mov r13, QWORD PTR [r10+rbx+8]
|
||||||
|
|
||||||
|
add r9d, r14d
|
||||||
|
or r9d, -2147483647
|
||||||
|
xor edx, edx
|
||||||
|
div r9
|
||||||
|
mov eax, eax
|
||||||
|
shl rdx, 32
|
||||||
|
lea r15, [rax+rdx]
|
||||||
|
lea rax, [r14+r15]
|
||||||
|
shr rax, 12
|
||||||
|
add rax, rdi
|
||||||
|
movd xmm0, rax
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movd rdi, xmm1
|
||||||
|
test rdi, 524287
|
||||||
|
je sqrt_fixup_ultralitev2_bulldozer
|
||||||
|
shr rdi, 19
|
||||||
|
|
||||||
|
sqrt_fixup_ultralitev2_bulldozer_ret:
|
||||||
|
mov rax, rsi
|
||||||
|
mul r14
|
||||||
|
movd xmm1, rax
|
||||||
|
movd xmm0, rdx
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
mov r9d, r10d
|
||||||
|
mov ecx, r10d
|
||||||
|
xor r9d, 16
|
||||||
|
xor ecx, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
pxor xmm2, xmm0
|
||||||
|
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
add r8, rdx
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r12], r8
|
||||||
|
xor r8, rsi
|
||||||
|
mov QWORD PTR [r12+8], r11
|
||||||
|
mov r10, r8
|
||||||
|
xor r11, r13
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa xmm3, xmm5
|
||||||
|
dec ebp
|
||||||
|
jne cnv2_main_loop_ultralitev2_bulldozer
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+64]
|
||||||
|
mov rbx, QWORD PTR [r11+56]
|
||||||
|
mov rbp, QWORD PTR [r11+64]
|
||||||
|
mov rsi, QWORD PTR [r11+72]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-48]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
|
||||||
|
|
||||||
|
; Slow-path correction of the sqrtsd-derived value used by the main loop.
; Entered from the loop when `test rdi, 524287` finds the low 19 bits of the
; raw sqrtsd result (held in rdi) all zero; jumps back to
; sqrt_fixup_ultralitev2_bulldozer_ret with the corrected value in rdi.
; Writes rax, rcx, rdx, r9, rdi and the flags.
; NOTE(review): the (-1022 << 32) term presumably cancels the exponent bias
; still present in the raw double bits -- confirm against the CryptoNight v2
; reference implementation.
sqrt_fixup_ultralitev2_bulldozer:
|
||||||
|
; r9 = low qword of xmm5 ...
movd r9, xmm5
|
||||||
|
; ... plus r15: the same sum the loop formed as r14+r15 before the sqrt.
add r9, r15
|
||||||
|
; Work from (raw result bits - 1).
dec rdi
|
||||||
|
; 32-bit mov zero-extends, so rdx = 0x00000000FFFFFC02 here ...
mov edx, -1022
|
||||||
|
; ... and rdx = 0xFFFFFC0200000000 after the shift.
shl rdx, 32
|
||||||
|
mov rax, rdi
|
||||||
|
; rdi = candidate, scaled by >>19 exactly like the fast path does.
shr rdi, 19
|
||||||
|
; rax = candidate >> 1 (same source value shifted one bit further).
shr rax, 20
|
||||||
|
mov rcx, rdi
|
||||||
|
; rcx = candidate - (candidate >> 1)
sub rcx, rax
|
||||||
|
; rcx += rdx + 1
lea rcx, [rcx+rdx+1]
|
||||||
|
; rax = (candidate >> 1) + rdx
add rax, rdx
|
||||||
|
; rcx = low 64 bits of the product of the two terms above.
imul rcx, rax
|
||||||
|
; Only the borrow matters: CF=1 when the product is below r9 (unsigned).
sub rcx, r9
|
||||||
|
; Add that borrow to the candidate: rdi += CF.
adc rdi, 0
|
||||||
|
; Rejoin the main loop right after the fast-path `shr rdi, 19`.
jmp sqrt_fixup_ultralitev2_bulldozer_ret
|
||||||
|
|
||||||
|
cnv2_main_loop_ultralitev2_bulldozer_endp:
|
182
src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc
Normal file
182
src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc
Normal file
|
@ -0,0 +1,182 @@
|
||||||
|
mov QWORD PTR [rsp+24], rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 80
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov esi, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
mov r13d, -2147483647
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movq xmm4, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movq xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
movq xmm3, QWORD PTR [r9+104]
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm6
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm8
|
||||||
|
and r10d, 131056
|
||||||
|
movq xmm5, rax
|
||||||
|
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm8, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
$main_loop_ultralitev2_ivybridge:
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
mov rdi, r15
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movq xmm0, r11
|
||||||
|
movq xmm7, r8
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
aesenc xmm6, xmm7
|
||||||
|
movq rbp, xmm6
|
||||||
|
mov r9, rbp
|
||||||
|
and r9d, 131056
|
||||||
|
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
mov r10, r9
|
||||||
|
xor r10d, 32
|
||||||
|
movq rcx, xmm3
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
xor rdi, rax
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
pxor xmm0, xmm4
|
||||||
|
movdqu XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rdi, QWORD PTR [r9+rbx]
|
||||||
|
lea r14, QWORD PTR [r9+rbx]
|
||||||
|
mov r12, QWORD PTR [r14+8]
|
||||||
|
xor edx, edx
|
||||||
|
lea r9d, DWORD PTR [ecx+ecx]
|
||||||
|
add r9d, ebp
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
psrldq xmm0, 8
|
||||||
|
or r9d, r13d
|
||||||
|
movq rax, xmm0
|
||||||
|
div r9
|
||||||
|
xorps xmm3, xmm3
|
||||||
|
mov eax, eax
|
||||||
|
shl rdx, 32
|
||||||
|
add rdx, rax
|
||||||
|
lea r9, QWORD PTR [rdx+rbp]
|
||||||
|
mov r15, rdx
|
||||||
|
mov rax, r9
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
sqrtsd xmm3, xmm0
|
||||||
|
movq rdx, xmm3
|
||||||
|
test edx, 524287
|
||||||
|
je $sqrt_fixup_ultralitev2_ivybridge
|
||||||
|
psrlq xmm3, 19
|
||||||
|
$sqrt_fixup_ultralitev2_ivybridge_ret:
|
||||||
|
|
||||||
|
mov ecx, r10d
|
||||||
|
mov rax, rdi
|
||||||
|
mul rbp
|
||||||
|
movq xmm2, rdx
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
add r8, rdx
|
||||||
|
mov QWORD PTR [r14], r8
|
||||||
|
xor r8, rdi
|
||||||
|
mov edi, r8d
|
||||||
|
and edi, 131056
|
||||||
|
movq xmm0, rax
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r14+8], r11
|
||||||
|
punpcklqdq xmm2, xmm0
|
||||||
|
|
||||||
|
mov r9d, r10d
|
||||||
|
xor r9d, 48
|
||||||
|
xor r10d, 16
|
||||||
|
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||||
|
movdqa xmm4, xmm6
|
||||||
|
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
movdqu xmm6, [rdi+rbx]
|
||||||
|
mov r10d, edi
|
||||||
|
xor r11, r12
|
||||||
|
dec rsi
|
||||||
|
jne $main_loop_ultralitev2_ivybridge
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
mov rbx, QWORD PTR [rsp+160]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||||
|
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||||
|
add rsp, 80
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
|
||||||
|
|
||||||
|
; Slow-path correction of the sqrtsd-derived value used by the main loop.
; Entered from the loop when `test edx, 524287` finds the low 19 bits of the
; raw sqrtsd result (copied to rdx with `movq rdx, xmm3`) all zero; jumps back
; to $sqrt_fixup_ultralitev2_ivybridge_ret with the corrected value in xmm3.
; On entry r9 still holds the sum the loop built with `lea r9, [rdx+rbp]`
; before the sqrt. r13 is borrowed as scratch and reloaded with the loop
; constant before returning. Writes rax, rcx, rdx, r13, xmm3 and the flags.
; NOTE(review): the (-1022 << 32) term presumably cancels the exponent bias
; still present in the raw double bits -- confirm against the CryptoNight v2
; reference implementation.
$sqrt_fixup_ultralitev2_ivybridge:
|
||||||
|
; Work from (raw result bits - 1).
dec rdx
|
||||||
|
; 32-bit mov zero-extends, so r13 = 0x00000000FFFFFC02 here ...
mov r13d, -1022
|
||||||
|
; ... and r13 = 0xFFFFFC0200000000 after the shift.
shl r13, 32
|
||||||
|
mov rax, rdx
|
||||||
|
; rdx = candidate, scaled by >>19 like the fast path's `psrlq xmm3, 19`.
shr rdx, 19
|
||||||
|
; rax = candidate >> 1 (same source value shifted one bit further).
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
; rcx = candidate - (candidate >> 1)
sub rcx, rax
|
||||||
|
; rax = (candidate >> 1) + r13
add rax, r13
|
||||||
|
; Subtracting ~r13 is the same as adding r13 + 1 (two's complement).
not r13
|
||||||
|
sub rcx, r13
|
||||||
|
; Put back the constant the loop expects in r13 for `or r9d, r13d`.
mov r13d, -2147483647
|
||||||
|
; rcx = low 64 bits of the product of the two terms above.
imul rcx, rax
|
||||||
|
; Only the borrow matters: CF=1 when the product is below r9 (unsigned).
sub rcx, r9
|
||||||
|
; Add that borrow to the candidate: rdx += CF.
adc rdx, 0
|
||||||
|
; Hand the corrected value back in xmm3 (upper qword is zeroed by movq).
movq xmm3, rdx
|
||||||
|
; Rejoin the main loop right after the fast-path `psrlq xmm3, 19`.
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
|
||||||
|
|
||||||
|
$cnv2_main_loop_ultralitev2_ivybridge_endp:
|
179
src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc
Normal file
179
src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc
Normal file
|
@ -0,0 +1,179 @@
|
||||||
|
mov QWORD PTR [rsp+16], rbx
|
||||||
|
mov QWORD PTR [rsp+24], rbp
|
||||||
|
mov QWORD PTR [rsp+32], rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 64
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp]
|
||||||
|
mov DWORD PTR [rsp+4], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r9, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov ebp, 65536
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r11, QWORD PTR [rcx+40]
|
||||||
|
mov r10, r8
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
movq xmm3, rax
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
xor r11, QWORD PTR [rcx+8]
|
||||||
|
mov rbx, QWORD PTR [rcx+224]
|
||||||
|
mov rax, QWORD PTR [r9+80]
|
||||||
|
xor rax, QWORD PTR [r9+64]
|
||||||
|
movq xmm0, rdx
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r9+72]
|
||||||
|
mov rdi, QWORD PTR [r9+104]
|
||||||
|
and r10d, 131056
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm6
|
||||||
|
movq xmm4, rax
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm8
|
||||||
|
xorps xmm8, xmm8
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm7, rax
|
||||||
|
mov r15, QWORD PTR [r9+96]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
movq xmm0, rcx
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
$main_loop_ultralitev2_ryzen:
|
||||||
|
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||||
|
movq xmm0, r11
|
||||||
|
movq xmm6, r8
|
||||||
|
punpcklqdq xmm6, xmm0
|
||||||
|
lea rdx, QWORD PTR [r10+rbx]
|
||||||
|
lea r9, QWORD PTR [rdi+rdi]
|
||||||
|
shl rdi, 32
|
||||||
|
|
||||||
|
mov ecx, r10d
|
||||||
|
mov eax, r10d
|
||||||
|
xor ecx, 16
|
||||||
|
xor eax, 32
|
||||||
|
xor r10d, 48
|
||||||
|
aesenc xmm5, xmm6
|
||||||
|
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||||
|
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||||
|
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
paddq xmm0, xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||||
|
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movaps xmm1, xmm8
|
||||||
|
mov rsi, r15
|
||||||
|
xor rsi, rdi
|
||||||
|
movq r14, xmm5
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
pxor xmm0, xmm3
|
||||||
|
mov r10, r14
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa XMMWORD PTR [rdx], xmm0
|
||||||
|
xor rsi, QWORD PTR [r10+rbx]
|
||||||
|
lea r12, QWORD PTR [r10+rbx]
|
||||||
|
mov r13, QWORD PTR [r10+rbx+8]
|
||||||
|
|
||||||
|
add r9d, r14d
|
||||||
|
or r9d, -2147483647
|
||||||
|
xor edx, edx
|
||||||
|
movdqa xmm0, xmm5
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movq rax, xmm0
|
||||||
|
|
||||||
|
div r9
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm1, rdx
|
||||||
|
punpckldq xmm0, xmm1
|
||||||
|
movq r15, xmm0
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqa xmm2, xmm0
|
||||||
|
psrlq xmm0, 12
|
||||||
|
paddq xmm0, xmm7
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdi, xmm1
|
||||||
|
test rdi, 524287
|
||||||
|
je $sqrt_fixup_ultralitev2_ryzen
|
||||||
|
shr rdi, 19
|
||||||
|
|
||||||
|
$sqrt_fixup_ultralitev2_ryzen_ret:
|
||||||
|
mov rax, rsi
|
||||||
|
mul r14
|
||||||
|
movq xmm1, rax
|
||||||
|
movq xmm0, rdx
|
||||||
|
punpcklqdq xmm0, xmm1
|
||||||
|
|
||||||
|
mov r9d, r10d
|
||||||
|
mov ecx, r10d
|
||||||
|
xor r9d, 16
|
||||||
|
xor ecx, 32
|
||||||
|
xor r10d, 48
|
||||||
|
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||||
|
xor rdx, [rcx+rbx]
|
||||||
|
xor rax, [rcx+rbx+8]
|
||||||
|
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||||
|
pxor xmm2, xmm0
|
||||||
|
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||||
|
paddq xmm2, xmm3
|
||||||
|
paddq xmm1, xmm6
|
||||||
|
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||||
|
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||||
|
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||||
|
|
||||||
|
movdqa xmm4, xmm3
|
||||||
|
add r8, rdx
|
||||||
|
add r11, rax
|
||||||
|
mov QWORD PTR [r12], r8
|
||||||
|
xor r8, rsi
|
||||||
|
mov QWORD PTR [r12+8], r11
|
||||||
|
mov r10, r8
|
||||||
|
xor r11, r13
|
||||||
|
and r10d, 131056
|
||||||
|
movdqa xmm3, xmm5
|
||||||
|
dec ebp
|
||||||
|
jne $main_loop_ultralitev2_ryzen
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||||
|
lea r11, QWORD PTR [rsp+64]
|
||||||
|
mov rbx, QWORD PTR [r11+56]
|
||||||
|
mov rbp, QWORD PTR [r11+64]
|
||||||
|
mov rsi, QWORD PTR [r11+72]
|
||||||
|
movaps xmm8, XMMWORD PTR [r11-48]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
mov rsp, r11
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
|
||||||
|
|
||||||
|
; Slow-path correction of the sqrtsd-derived value used by the main loop.
; Entered from the loop when `test rdi, 524287` finds the low 19 bits of the
; raw sqrtsd result (held in rdi) all zero; jumps back to
; $sqrt_fixup_ultralitev2_ryzen_ret with the corrected value in rdi.
; Writes rax, rcx, rdx, r9, rdi and the flags.
; NOTE(review): the (-1022 << 32) term presumably cancels the exponent bias
; still present in the raw double bits -- confirm against the CryptoNight v2
; reference implementation.
$sqrt_fixup_ultralitev2_ryzen:
|
||||||
|
; r9 = low qword of xmm2, the copy the loop took of xmm0 right after
; `paddq xmm0, xmm5` and before the `psrlq xmm0, 12` that feeds sqrtsd.
movq r9, xmm2
|
||||||
|
; Work from (raw result bits - 1).
dec rdi
|
||||||
|
; 32-bit mov zero-extends, so rdx = 0x00000000FFFFFC02 here ...
mov edx, -1022
|
||||||
|
; ... and rdx = 0xFFFFFC0200000000 after the shift.
shl rdx, 32
|
||||||
|
mov rax, rdi
|
||||||
|
; rdi = candidate, scaled by >>19 exactly like the fast path does.
shr rdi, 19
|
||||||
|
; rax = candidate >> 1 (same source value shifted one bit further).
shr rax, 20
|
||||||
|
mov rcx, rdi
|
||||||
|
; rcx = candidate - (candidate >> 1)
sub rcx, rax
|
||||||
|
; rcx += rdx + 1
lea rcx, [rcx+rdx+1]
|
||||||
|
; rax = (candidate >> 1) + rdx
add rax, rdx
|
||||||
|
; rcx = low 64 bits of the product of the two terms above.
imul rcx, rax
|
||||||
|
; Only the borrow matters: CF=1 when the product is below r9 (unsigned).
sub rcx, r9
|
||||||
|
; Add that borrow to the candidate: rdi += CF.
adc rdi, 0
|
||||||
|
; Rejoin the main loop right after the fast-path `shr rdi, 19`.
jmp $sqrt_fixup_ultralitev2_ryzen_ret
|
||||||
|
|
||||||
|
$cnv2_main_loop_ultralitev2_ryzen_endp:
|
|
@ -0,0 +1,267 @@
|
||||||
|
mov QWORD PTR [rsp+8], rcx
|
||||||
|
push rbx
|
||||||
|
push rbp
|
||||||
|
push rsi
|
||||||
|
push rdi
|
||||||
|
push r12
|
||||||
|
push r13
|
||||||
|
push r14
|
||||||
|
push r15
|
||||||
|
sub rsp, 152
|
||||||
|
|
||||||
|
stmxcsr DWORD PTR [rsp+4]
|
||||||
|
mov DWORD PTR [rsp], 24448
|
||||||
|
ldmxcsr DWORD PTR [rsp]
|
||||||
|
|
||||||
|
mov rax, QWORD PTR [rcx+48]
|
||||||
|
mov r10, rcx
|
||||||
|
xor rax, QWORD PTR [rcx+16]
|
||||||
|
mov r8, QWORD PTR [rcx+32]
|
||||||
|
xor r8, QWORD PTR [rcx]
|
||||||
|
mov r9, QWORD PTR [rcx+40]
|
||||||
|
xor r9, QWORD PTR [rcx+8]
|
||||||
|
movq xmm4, rax
|
||||||
|
mov rdx, QWORD PTR [rcx+56]
|
||||||
|
xor rdx, QWORD PTR [rcx+24]
|
||||||
|
mov r11, QWORD PTR [rcx+224]
|
||||||
|
mov rcx, QWORD PTR [rcx+88]
|
||||||
|
xor rcx, QWORD PTR [r10+72]
|
||||||
|
mov rax, QWORD PTR [r10+80]
|
||||||
|
movq xmm0, rdx
|
||||||
|
xor rax, QWORD PTR [r10+64]
|
||||||
|
|
||||||
|
movaps XMMWORD PTR [rsp+16], xmm6
|
||||||
|
movaps XMMWORD PTR [rsp+32], xmm7
|
||||||
|
movaps XMMWORD PTR [rsp+48], xmm8
|
||||||
|
movaps XMMWORD PTR [rsp+64], xmm9
|
||||||
|
movaps XMMWORD PTR [rsp+80], xmm10
|
||||||
|
movaps XMMWORD PTR [rsp+96], xmm11
|
||||||
|
movaps XMMWORD PTR [rsp+112], xmm12
|
||||||
|
movaps XMMWORD PTR [rsp+128], xmm13
|
||||||
|
|
||||||
|
movq xmm5, rax
|
||||||
|
|
||||||
|
mov ax, 1023
|
||||||
|
shl rax, 52
|
||||||
|
movq xmm8, rax
|
||||||
|
|
||||||
|
mov rax, r8
|
||||||
|
punpcklqdq xmm4, xmm0
|
||||||
|
and eax, 131056
|
||||||
|
movq xmm10, QWORD PTR [r10+96]
|
||||||
|
movq xmm0, rcx
|
||||||
|
mov rcx, QWORD PTR [r10+104]
|
||||||
|
xorps xmm9, xmm9
|
||||||
|
mov QWORD PTR [rsp+248], rax
|
||||||
|
movq xmm12, r11
|
||||||
|
mov QWORD PTR [rsp+240], r9
|
||||||
|
punpcklqdq xmm5, xmm0
|
||||||
|
movq xmm13, rcx
|
||||||
|
mov r12d, 65536
|
||||||
|
|
||||||
|
ALIGN 64
|
||||||
|
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
|
||||||
|
movd xmm11, r12d
|
||||||
|
mov r12, QWORD PTR [r10+272]
|
||||||
|
lea r13, QWORD PTR [rax+r11]
|
||||||
|
mov esi, DWORD PTR [r13]
|
||||||
|
movq xmm0, r9
|
||||||
|
mov r10d, DWORD PTR [r13+4]
|
||||||
|
movq xmm7, r8
|
||||||
|
mov ebp, DWORD PTR [r13+12]
|
||||||
|
mov r14d, DWORD PTR [r13+8]
|
||||||
|
mov rdx, QWORD PTR [rsp+248]
|
||||||
|
movzx ecx, sil
|
||||||
|
shr esi, 8
|
||||||
|
punpcklqdq xmm7, xmm0
|
||||||
|
mov r15d, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
mov edi, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, r14b
|
||||||
|
shr r14d, 8
|
||||||
|
mov ebx, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, bpl
|
||||||
|
shr ebp, 8
|
||||||
|
mov r9d, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
movzx ecx, r14b
|
||||||
|
shr r14d, 8
|
||||||
|
mov eax, r14d
|
||||||
|
shr eax, 8
|
||||||
|
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
add eax, 256
|
||||||
|
movzx ecx, bpl
|
||||||
|
shr ebp, 8
|
||||||
|
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
movzx ecx, sil
|
||||||
|
shr esi, 8
|
||||||
|
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||||
|
add r12, 2048
|
||||||
|
movzx ecx, r10b
|
||||||
|
shr r10d, 8
|
||||||
|
add r10d, 256
|
||||||
|
mov r11d, DWORD PTR [r12+rax*4]
|
||||||
|
xor r11d, DWORD PTR [r12+rcx*4]
|
||||||
|
xor r11d, r9d
|
||||||
|
movzx ecx, sil
|
||||||
|
mov r10d, DWORD PTR [r12+r10*4]
|
||||||
|
shr esi, 8
|
||||||
|
add esi, 256
|
||||||
|
xor r10d, DWORD PTR [r12+rcx*4]
|
||||||
|
movzx ecx, bpl
|
||||||
|
xor r10d, ebx
|
||||||
|
shr ebp, 8
|
||||||
|
movd xmm1, r11d
|
||||||
|
add ebp, 256
|
||||||
|
movq r11, xmm12
|
||||||
|
mov r9d, DWORD PTR [r12+rcx*4]
|
||||||
|
xor r9d, DWORD PTR [r12+rsi*4]
|
||||||
|
mov eax, DWORD PTR [r12+rbp*4]
|
||||||
|
xor r9d, edi
|
||||||
|
movzx ecx, r14b
|
||||||
|
movd xmm0, r10d
|
||||||
|
movd xmm2, r9d
|
||||||
|
xor eax, DWORD PTR [r12+rcx*4]
|
||||||
|
mov rcx, rdx
|
||||||
|
xor eax, r15d
|
||||||
|
punpckldq xmm2, xmm1
|
||||||
|
xor rcx, 16
|
||||||
|
movd xmm6, eax
|
||||||
|
mov rax, rdx
|
||||||
|
punpckldq xmm6, xmm0
|
||||||
|
xor rax, 32
|
||||||
|
punpckldq xmm6, xmm2
|
||||||
|
xor rdx, 48
|
||||||
|
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||||
|
pxor xmm6, xmm7
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||||
|
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||||
|
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||||
|
movq rcx, xmm13
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||||
|
movq rdi, xmm6
|
||||||
|
mov r10, rdi
|
||||||
|
and r10d, 131056
|
||||||
|
xor edx, edx
|
||||||
|
mov rax, rcx
|
||||||
|
shl rax, 32
|
||||||
|
movq rbx, xmm10
|
||||||
|
xor rbx, rax
|
||||||
|
lea r9, QWORD PTR [rcx+rcx]
|
||||||
|
add r9d, edi
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
pxor xmm0, xmm4
|
||||||
|
mov ecx, -2147483647
|
||||||
|
movdqu XMMWORD PTR [r13], xmm0
|
||||||
|
or r9, rcx
|
||||||
|
movdqa xmm0, xmm6
|
||||||
|
movaps xmm1, xmm9
|
||||||
|
psrldq xmm0, 8
|
||||||
|
movq rax, xmm0
|
||||||
|
xor rbx, QWORD PTR [r10+r11]
|
||||||
|
lea r14, QWORD PTR [r10+r11]
|
||||||
|
mov rbp, QWORD PTR [r14+8]
|
||||||
|
div r9
|
||||||
|
shl rdx, 32
|
||||||
|
mov eax, eax
|
||||||
|
add rdx, rax
|
||||||
|
lea r9, QWORD PTR [rdx+rdi]
|
||||||
|
movq xmm10, rdx
|
||||||
|
mov rax, r9
|
||||||
|
shr rax, 12
|
||||||
|
movq xmm0, rax
|
||||||
|
paddq xmm0, xmm8
|
||||||
|
sqrtsd xmm1, xmm0
|
||||||
|
movq rdx, xmm1
|
||||||
|
test rdx, 524287
|
||||||
|
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
|
||||||
|
psrlq xmm1, 19
|
||||||
|
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
|
||||||
|
|
||||||
|
mov r9, r10
|
||||||
|
movdqa xmm13, xmm1
|
||||||
|
xor r9, 16
|
||||||
|
mov rcx, r10
|
||||||
|
xor rcx, 32
|
||||||
|
xor r10, 48
|
||||||
|
mov rax, rbx
|
||||||
|
mul rdi
|
||||||
|
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||||
|
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||||
|
paddq xmm1, xmm7
|
||||||
|
movq xmm0, rax
|
||||||
|
movq xmm3, rdx
|
||||||
|
xor rax, QWORD PTR [r11+rcx+8]
|
||||||
|
xor rdx, QWORD PTR [rcx+r11]
|
||||||
|
punpcklqdq xmm3, xmm0
|
||||||
|
add r8, rdx
|
||||||
|
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||||
|
pxor xmm2, xmm3
|
||||||
|
paddq xmm0, xmm5
|
||||||
|
paddq xmm2, xmm4
|
||||||
|
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||||
|
movdqa xmm5, xmm4
|
||||||
|
mov r9, QWORD PTR [rsp+240]
|
||||||
|
movdqa xmm4, xmm6
|
||||||
|
add r9, rax
|
||||||
|
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||||
|
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||||
|
mov r10, QWORD PTR [rsp+224]
|
||||||
|
movd r12d, xmm11
|
||||||
|
mov QWORD PTR [r14], r8
|
||||||
|
xor r8, rbx
|
||||||
|
mov rax, r8
|
||||||
|
mov QWORD PTR [r14+8], r9
|
||||||
|
and eax, 131056
|
||||||
|
xor r9, rbp
|
||||||
|
mov QWORD PTR [rsp+240], r9
|
||||||
|
mov QWORD PTR [rsp+248], rax
|
||||||
|
sub r12d, 1
|
||||||
|
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
|
||||||
|
|
||||||
|
ldmxcsr DWORD PTR [rsp+4]
|
||||||
|
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||||
|
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||||
|
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||||
|
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||||
|
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||||
|
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||||
|
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||||
|
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||||
|
|
||||||
|
add rsp, 152
|
||||||
|
pop r15
|
||||||
|
pop r14
|
||||||
|
pop r13
|
||||||
|
pop r12
|
||||||
|
pop rdi
|
||||||
|
pop rsi
|
||||||
|
pop rbp
|
||||||
|
pop rbx
|
||||||
|
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
|
||||||
|
|
||||||
|
; Slow-path correction of the sqrtsd-derived value used by the soft-AES loop.
; Entered from the loop when `test rdx, 524287` finds the low 19 bits of the
; raw sqrtsd result (copied to rdx with `movq rdx, xmm1`) all zero; jumps back
; to sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret with the corrected value
; in xmm1, which the loop then copies into xmm13.
; On entry r9 still holds the sum the loop built with `lea r9, [rdx+rdi]`
; before the sqrt. r15 is used as scratch; the loop reloads r15d from the
; lookup table at the top of every iteration.
; Writes rax, rcx, rdx, r15, xmm1 and the flags.
; NOTE(review): the (-1022 << 32) term presumably cancels the exponent bias
; still present in the raw double bits -- confirm against the CryptoNight v2
; reference implementation.
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
|
||||||
|
; Work from (raw result bits - 1).
dec rdx
|
||||||
|
; 32-bit mov zero-extends, so r15 = 0x00000000FFFFFC02 here ...
mov r15d, -1022
|
||||||
|
; ... and r15 = 0xFFFFFC0200000000 after the shift.
shl r15, 32
|
||||||
|
mov rax, rdx
|
||||||
|
; rdx = candidate, scaled by >>19 like the fast path's `psrlq xmm1, 19`.
shr rdx, 19
|
||||||
|
; rax = candidate >> 1 (same source value shifted one bit further).
shr rax, 20
|
||||||
|
mov rcx, rdx
|
||||||
|
; rcx = candidate - (candidate >> 1)
sub rcx, rax
|
||||||
|
; rcx += r15 + 1
lea rcx, [rcx+r15+1]
|
||||||
|
; rax = (candidate >> 1) + r15
add rax, r15
|
||||||
|
; rcx = low 64 bits of the product of the two terms above.
imul rcx, rax
|
||||||
|
; Only the borrow matters: CF=1 when the product is below r9 (unsigned).
sub rcx, r9
|
||||||
|
; Add that borrow to the candidate: rdx += CF.
adc rdx, 0
|
||||||
|
; Hand the corrected value back in xmm1 (upper qword is zeroed by movq).
movq xmm1, rdx
|
||||||
|
; Rejoin the loop right after the fast-path `psrlq xmm1, 19`.
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
|
||||||
|
|
||||||
|
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:
|
|
@ -1,10 +1,10 @@
|
||||||
{
|
{
|
||||||
"algo": "cryptonight", // cryptonight (default), cryptonight-lite or cryptonight-heavy
|
"algo": "cryptonight", // cryptonight (default), cryptonight-lite, cryptonight-ultralite or cryptonight-heavy
|
||||||
"aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off)
|
"aesni": 0, // selection of AES-NI mode (0 auto, 1 on, 2 off)
|
||||||
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
|
||||||
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
|
||||||
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
|
||||||
"pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx
|
"pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for > v5), msr, xhv, rto, xfh, upx, turtle
|
||||||
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
|
||||||
"asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
"asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
|
||||||
"background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead)
|
"background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead)
|
||||||
|
|
|
@ -138,6 +138,10 @@ bool Job::setTarget(const char *target)
|
||||||
|
|
||||||
PowVariant Job::powVariant() const
|
PowVariant Job::powVariant() const
|
||||||
{
|
{
|
||||||
|
if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
|
||||||
|
return PowVariant::POW_V2;
|
||||||
|
}
|
||||||
|
|
||||||
if (m_powVariant == PowVariant::POW_AUTODETECT) {
|
if (m_powVariant == PowVariant::POW_AUTODETECT) {
|
||||||
if (m_blob[0] > 7) {
|
if (m_blob[0] > 7) {
|
||||||
return PowVariant::POW_V2;
|
return PowVariant::POW_V2;
|
||||||
|
|
|
@ -60,6 +60,8 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
|
||||||
url = new Url("donate2.graef.in", 8443, userId, nullptr, true, false, true);
|
url = new Url("donate2.graef.in", 8443, userId, nullptr, true, false, true);
|
||||||
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) {
|
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) {
|
||||||
url = new Url("donate2.graef.in", 1080, userId, nullptr, true, false, true);
|
url = new Url("donate2.graef.in", 1080, userId, nullptr, true, false, true);
|
||||||
|
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
|
||||||
|
url = new Url("donate2.graef.in", 8090, userId, nullptr, true, false, true);
|
||||||
} else {
|
} else {
|
||||||
url = new Url("donate2.graef.in", 443, userId, nullptr, true, false, true);
|
url = new Url("donate2.graef.in", 443, userId, nullptr, true, false, true);
|
||||||
}
|
}
|
||||||
|
@ -68,6 +70,8 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
|
||||||
url = new Url("donate.graef.in", 8443, userId, nullptr, false, false, true);
|
url = new Url("donate.graef.in", 8443, userId, nullptr, false, false, true);
|
||||||
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) {
|
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) {
|
||||||
url = new Url("donate.graef.in", 1080, userId, nullptr, false, false, true);
|
url = new Url("donate.graef.in", 1080, userId, nullptr, false, false, true);
|
||||||
|
} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
|
||||||
|
url = new Url("donate2.graef.in", 8088, userId, nullptr, false, false, true);
|
||||||
} else {
|
} else {
|
||||||
url = new Url("donate2.graef.in", 80, userId, nullptr, false, false, true);
|
url = new Url("donate2.graef.in", 80, userId, nullptr, false, false, true);
|
||||||
}
|
}
|
||||||
|
|
|
@ -36,14 +36,14 @@
|
||||||
#define APP_DESC "XMRigCC CPU miner"
|
#define APP_DESC "XMRigCC CPU miner"
|
||||||
#define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id"
|
#define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id"
|
||||||
#endif
|
#endif
|
||||||
#define APP_VERSION "1.8.8 (based on XMRig)"
|
#define APP_VERSION "1.8.9 (based on XMRig)"
|
||||||
#define APP_DOMAIN ""
|
#define APP_DOMAIN ""
|
||||||
#define APP_SITE "https://github.com/Bendr0id/xmrigCC"
|
#define APP_SITE "https://github.com/Bendr0id/xmrigCC"
|
||||||
#define APP_KIND "cpu"
|
#define APP_KIND "cpu"
|
||||||
|
|
||||||
#define APP_VER_MAJOR 1
|
#define APP_VER_MAJOR 1
|
||||||
#define APP_VER_MINOR 8
|
#define APP_VER_MINOR 8
|
||||||
#define APP_VER_BUILD 8
|
#define APP_VER_BUILD 9
|
||||||
#define APP_VER_REV 0
|
#define APP_VER_REV 0
|
||||||
|
|
||||||
#ifndef NDEBUG
|
#ifndef NDEBUG
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue