Add ASM optimizations (#208)

* Add ASM optimizations

- Add ASM optimization for CN-FAST on INTEL
- Add ASM optimization for CNV2 on AMD Bulldozer
- Alloy is now announced as XAO

* Upgraded default configs
This commit is contained in:
Ben Gräf 2018-11-12 12:20:59 +01:00 committed by GitHub
parent eebf62cd6a
commit 8997e74b90
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
18 changed files with 992 additions and 46 deletions

View file

@ -26,6 +26,7 @@ enum AsmOptimization
ASM_AUTODETECT, ASM_AUTODETECT,
ASM_INTEL, ASM_INTEL,
ASM_RYZEN, ASM_RYZEN,
ASM_BULLDOZER,
ASM_OFF ASM_OFF
}; };
@ -37,6 +38,8 @@ inline std::string getAsmOptimizationName(AsmOptimization asmOptimization)
return "INTEL"; return "INTEL";
case ASM_RYZEN: case ASM_RYZEN:
return "RYZEN"; return "RYZEN";
case ASM_BULLDOZER:
return "BULLDOZER";
case ASM_OFF: case ASM_OFF:
return "OFF"; return "OFF";
case ASM_AUTODETECT: case ASM_AUTODETECT:
@ -62,7 +65,11 @@ inline AsmOptimization parseAsmOptimization(int optimization)
case 2: case 2:
asmOptimization = AsmOptimization::ASM_RYZEN; asmOptimization = AsmOptimization::ASM_RYZEN;
break; break;
case 3:
// 3 continues the numbering after case 2 (RYZEN) and selects the Bulldozer kernels.
// It must not map to ASM_AUTODETECT, otherwise the numeric id could never pick them.
asmOptimization = AsmOptimization::ASM_BULLDOZER;
break;
default: default:
asmOptimization = AsmOptimization::ASM_AUTODETECT;
break; break;
} }
@ -79,6 +86,8 @@ inline AsmOptimization parseAsmOptimization(const std::string optimization)
asmOptimization = AsmOptimization::ASM_INTEL; asmOptimization = AsmOptimization::ASM_INTEL;
} else if (optimization == "2" || optimization == "ryzen") { } else if (optimization == "2" || optimization == "ryzen") {
asmOptimization = AsmOptimization::ASM_RYZEN; asmOptimization = AsmOptimization::ASM_RYZEN;
} else if (optimization == "3" || optimization == "bulldozer") {
// "3" / "bulldozer" selects the Bulldozer kernels; the Ryzen value belongs to the branch above.
asmOptimization = AsmOptimization::ASM_BULLDOZER;
} }
return asmOptimization; return asmOptimization;

View file

@ -82,8 +82,12 @@ void CpuImpl::initCommon()
} }
# ifndef XMRIG_NO_ASM # ifndef XMRIG_NO_ASM
if (data.vendor == VENDOR_AMD && data.ext_family >= 0x17) { if (data.vendor == VENDOR_AMD) {
m_asmOptimization = AsmOptimization::ASM_RYZEN; if (data.ext_family >= 0x17) {
m_asmOptimization = AsmOptimization::ASM_RYZEN;
} else if (data.ext_family >= 0x15) {
m_asmOptimization = AsmOptimization::ASM_BULLDOZER;
}
} else if (data.vendor == VENDOR_INTEL && } else if (data.vendor == VENDOR_INTEL &&
((data.ext_family >= 0x06 && data.ext_model > 0x2) || ((data.ext_family >= 0x06 && data.ext_model > 0x2) ||
(data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) { (data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) {

View file

@ -73,9 +73,9 @@ Options:\n"
-k, --keepalive send keepalived for prevent timeout (need pool support)\n\ -k, --keepalive send keepalived for prevent timeout (need pool support)\n\
-r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\
-R, --retry-pause=N time to pause between retries (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'alloy', 'xtl' (including autodetect for v5)\n\ --pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for v5)\n\
for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\ for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'off' \n\ --asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
--multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\ --multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
--multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\ --multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\
--cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\ --cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\
@ -287,7 +287,7 @@ constexpr static const char *pow_variant_names[] = {
"1", "1",
"2", "2",
"tube", "tube",
"alloy", "xao",
"xtl", "xtl",
"msr", "msr",
"xhv", "xhv",
@ -298,6 +298,7 @@ constexpr static const char *asm_optimization_names[] = {
"auto", "auto",
"intel", "intel",
"ryzen", "ryzen",
"bulldozer",
"off" "off"
}; };
@ -1049,6 +1050,11 @@ bool Options::parsePowVariant(const char *powVariant)
break; break;
} }
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "xao") || !strcmp(powVariant, "alloy"))) {
m_powVariant = POW_ALLOY;
break;
}
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || !strcmp(powVariant, "bittube"))) { if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || !strcmp(powVariant, "bittube"))) {
m_powVariant = POW_TUBE; m_powVariant = POW_TUBE;
break; break;

View file

@ -50,7 +50,7 @@ inline std::string getPowVariantName(PowVariant powVariant)
case POW_TUBE: case POW_TUBE:
return "tube"; return "tube";
case POW_ALLOY: case POW_ALLOY:
return "alloy"; return "xao";
case POW_XTL: case POW_XTL:
return "xtl"; return "xtl";
case POW_MSR: case POW_MSR:

View file

@ -4,9 +4,9 @@
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
"asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, off "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
"background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead)
"colors": true, // false to disable colored output "colors": true, // false to disable colored output
"cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1

View file

@ -50,14 +50,16 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
#if defined(XMRIG_ARM) #if defined(XMRIG_ARM)
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
#else #else
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1)) { if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
} else { } else {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
} }
#endif #endif
} else if (powVersion == PowVariant::POW_ALLOY) { } else if (powVersion == PowVariant::POW_ALLOY) {
CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
} else if (powVersion == PowVariant::POW_XTL) { } else if (powVersion == PowVariant::POW_XTL) {
#if defined(XMRIG_ARM) #if defined(XMRIG_ARM)
CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
@ -69,7 +71,11 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
} }
#endif #endif
} else if (powVersion == PowVariant::POW_MSR) { } else if (powVersion == PowVariant::POW_MSR) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
} else {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
}
} else if (powVersion == PowVariant::POW_RTO) { } else if (powVersion == PowVariant::POW_RTO) {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
}else { }else {
@ -113,7 +119,11 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
} }
#endif #endif
} else if (powVersion == PowVariant::POW_MSR) { } else if (powVersion == PowVariant::POW_MSR) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
} else {
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
}
} else if (powVersion == PowVariant::POW_RTO) { } else if (powVersion == PowVariant::POW_RTO) {
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
} else { } else {
@ -464,25 +474,15 @@ bool CryptoNight::selfTest(int algo)
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads); cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_xtl, 32) == 0; result = result && memcmp(output, test_output_xtl, 32) == 0;
#if MAX_NUM_HASH_BLOCKS > 1 // cnv7 + msr aka cn-fast
cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_xtl, 64) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 2 cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_MSR,test_input, 76, output, scratchPads);
cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_msr, 32) == 0;
result = result && memcmp(output, test_output_xtl, 96) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 3 // cnv7 + alloy
cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_xtl, 128) == 0;
#endif
#if MAX_NUM_HASH_BLOCKS > 4 cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads);
cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_alloy, 32) == 0;
result = result && memcmp(output, test_output_xtl, 160) == 0;
#endif
// cn v8 aka cnv2 // cn v8 aka cnv2

View file

@ -99,20 +99,24 @@ const static uint8_t test_output_v2[160] = {
}; };
// CN XTL // CN XTL
const static uint8_t test_output_xtl[160] = { const static uint8_t test_output_xtl[32] = {
0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3,
0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1
0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90,
0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF,
0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3,
0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86,
0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D,
0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF,
0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA,
0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48
}; };
// CN MSR
// Expected 32-byte digest that selfTest compares against the output of the
// single-hash POW_MSR run over the 76-byte test input.
const static uint8_t test_output_msr[32] = {
0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5
};
// CN ALLOY
// Expected 32-byte digest that selfTest compares against the output of the
// single-hash POW_ALLOY run over the 76-byte test input.
const static uint8_t test_output_alloy[32] = {
0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33
};
// CN-LITE
const static uint8_t test_output_v0_lite[160] = { const static uint8_t test_output_v0_lite[160] = {
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,

View file

@ -52,10 +52,13 @@ extern "C"
#ifndef XMRIG_NO_ASM #ifndef XMRIG_NO_ASM
void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0);
void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0); void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0);
void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0);
void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0); void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0); void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
#endif #endif
@ -1424,14 +1427,22 @@ public:
if (ITERATIONS == 0x80000) { if (ITERATIONS == 0x80000) {
cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000){ } else if (ITERATIONS == 0x40000) {
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); if (MASK == 0x1FFFF0) {
cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
} else {
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
}
} }
} else { } else {
if (ITERATIONS == 0x80000) { if (ITERATIONS == 0x80000) {
cnv1_mainloop_sandybridge_asm(scratchPad[0]); cnv1_mainloop_sandybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000){ } else if (ITERATIONS == 0x40000) {
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]); if (MASK == 0x1FFFF0) {
cn_fast_mainloop_sandybridge_asm(scratchPad[0]);
} else {
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
}
} }
} }
#endif #endif
@ -1538,6 +1549,8 @@ public:
} }
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) { } else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
cnv2_mainloop_ryzen_asm(scratchPad[0]); cnv2_mainloop_ryzen_asm(scratchPad[0]);
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
cnv2_mainloop_bulldozer_asm(scratchPad[0]);
} }
#endif #endif

View file

@ -0,0 +1,74 @@
/*
 * Main-loop body of cn_fast_mainloop_sandybridge_asm (hardware AES path).
 * The wrapper copies its pointer argument into rcx before including this
 * file, so every [rcx+N] below reads a field of that context.
 * The loop runs 262144 (0x40000) times and addresses the buffer at
 * [rcx+224] through the 2097136 (0x1FFFF0) index mask.
 */
/* Park rbx/rbp/rsi/rdi in the stack slots above rsp; they are reloaded after the loop. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
/* Build the starting state by xoring pairs of context words; ebp is the loop counter. */
mov rax, QWORD PTR [rcx+48]
mov ebp, 262144
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
/* r15 = base of the byte table used for the byte-11 substitution inside the loop. */
mov r15, QWORD PTR [rcx+264]
and edx, 2097136
/* r14 = qword at offset 35 of the buffer pointed to by [rcx+256], xored with [rcx+192]. */
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
/* rsi = base address for all masked accesses; preload the first 16-byte block. */
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
movdqu xmm2, XMMWORD PTR [rdx+rsi]
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_fast_mainloop_sandybridge:
/* xmm1 = {low: r8, high: rdi}; one AES round over the loaded block with that round key. */
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
/* r10 = low qword of the AES result; r9 = address of the second block it selects. */
movq r10, xmm2
mov r9d, r10d
and r9d, 2097136
add r9, rsi
/* Store (result xor xmm3) back to the first block, then keep the result in xmm3 for the next round. */
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
/* Replace byte 11 of the block just written with its entry from the r15 table. */
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
/* 64x64 multiply of the second block's low qword by r10: rdx = high half, rax = low half. */
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
mul r10
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
/* The high qword is written back xored with r14. */
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
/* Fold the old block contents into the state and fetch the block for the next round. */
xor r8, rbx
mov rdx, r8
and edx, 2097136
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
/* Branch to the loop label defined in this file, not to a label owned by a different routine. */
jne cn_fast_mainloop_sandybridge
/* The two pushes moved rsp down by 16, so the parked registers now sit 16 bytes higher. */
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -0,0 +1,166 @@
/*
 * Main-loop body of cn_fast_mainloop_soft_aes_sandybridge_asm (table-driven
 * AES round instead of the aesenc instruction).
 * The wrapper copies its pointer argument into rcx before including this
 * file. The loop runs 262144 (0x40000) times and addresses the buffer at
 * [rcx+224] through the 2097136 (0x1FFFF0) index mask.
 */
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
/* 72 bytes of locals: four 16-byte xmm save slots plus one qword at [rsp+64]. */
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
/* Build the starting state by xoring pairs of context words. */
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
mov rdi, QWORD PTR [rcx+224]
and edx, 2097136
/*
 * General-purpose registers are scarce inside the loop, so loop-invariant
 * values are parked in xmm registers:
 *   xmm5 = qword at offset 35 of the buffer at [rcx+256], xored with [rcx+192]
 *   xmm6 = context pointer (rcx)
 *   xmm7 = byte table base ([rcx+264])
 *   xmm8 = buffer base ([rcx+224])
 * The current masked index lives at [rsp+64].
 */
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
/* eax = loop counter; it is parked in xmm9 at the top of every iteration. */
mov eax, 262144
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_fast_mainloop_soft_aes_sandybridge:
movq xmm9, rax
/* r12 = base of the dword lookup tables; load the four dwords of the current block. */
mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
/*
 * Each input byte indexes one of four 256-entry dword tables located at
 * r12, r12+1024, r12+2048 and r12+3072 (the last two are reached after r12
 * is advanced by 2048, the fourth through the +256 index bias). The four
 * output columns are accumulated in r15d, edi, ebx and r9d.
 */
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
/* rdi was used as a scratch column above; restore the buffer base from xmm8. */
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
movd xmm3, eax
/* rax = byte table base (parked in xmm7). */
movq rax, xmm7
punpckldq xmm3, xmm0
/* xmm1 = {low: r8, high: r13}; xor it into the assembled columns to finish the round. */
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
/* r9 = low qword of the round result; r10d = masked index of the second block. */
movq r9, xmm3
mov r10d, r9d
and r10d, 2097136
/* Store (result xor xmm4) back to the first block. */
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
/* Replace byte 11 of the block just written with its entry from the byte table. */
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
/* 64x64 multiply of the second block's low qword by the round result: rdx = high, rax = low. */
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
movdqa xmm4, xmm3
mul rcx
/* Restore the context pointer for the next iteration. */
movq rcx, xmm6
add r8, rdx
add r13, rax
/* The high qword is written back xored with the value parked in xmm5. */
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
/* Reload the loop counter, compute the next masked index and spill it. */
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 2097136
mov QWORD PTR [rsp+64], rdx
sub eax, 1
/* Branch to the loop label defined in this file, not to a label owned by a different routine. */
jne cn_fast_mainloop_soft_aes_sandybridge
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -9,12 +9,15 @@
#endif #endif
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
#ifdef __APPLE__ #ifdef __APPLE__
@ -41,6 +44,18 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
add rsp, 48 add rsp, 48
ret 0 ret 0
/*
 * Entry point declared to C as
 *   void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0);
 * The real work lives in the included body; this wrapper only adapts the
 * register and stack layout that body expects.
 */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
/* Reserve stack around the body, which stores registers at [rsp+8]..[rsp+32]. */
sub rsp, 48
/* The included body reads the context through rcx; copy the pointer over from rdi. */
mov rcx, rdi
#include "cn_fast_mainloop_sandybridge.inc"
add rsp, 48
ret 0
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
@ -65,6 +80,18 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
add rsp, 48 add rsp, 48
ret 0 ret 0
/*
 * Entry point declared to C as
 *   void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
 * The real work lives in the included body; this wrapper only adapts the
 * register and stack layout that body expects.
 */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
/* Reserve stack around the body, which stores registers at [rsp+16]..[rsp+32]. */
sub rsp, 48
/* The included body reads the context through rcx; copy the pointer over from rdi. */
mov rcx, rdi
#include "cnv2_main_loop_bulldozer.inc"
add rsp, 48
ret 0
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
@ -102,6 +129,18 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
add rsp, 48 add rsp, 48
ret 0 ret 0
/*
 * Entry point declared to C as
 *   void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
 * The real work lives in the included body; this wrapper only adapts the
 * register and stack layout that body expects.
 */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
/* Reserve 48 bytes of stack around the included body (same frame as the sibling wrappers). */
sub rsp, 48
/* The included body reads the context through rcx; copy the pointer over from rdi. */
mov rcx, rdi
#include "cn_fast_mainloop_soft_aes_sandybridge.inc"
add rsp, 48
ret 0
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else

View file

@ -0,0 +1,180 @@
/*
 * Main-loop body of cnv2_mainloop_bulldozer_asm. The wrapper copies its
 * pointer argument into rcx before including this file, so every [rcx+N] /
 * [r9+N] in the prologue reads a field of that context.
 * The loop runs 524288 (0x80000) times and addresses the buffer at
 * [rcx+224] through the 2097136 (0x1FFFF0) index mask.
 */
/* Park rbx/rbp/rsi in the stack slots above rsp; they are reloaded in the epilogue. */
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
/*
 * Save the caller's MXCSR at [rsp] and switch to 24448 (0x5F80) for the
 * duration of the loop; the saved value is reloaded in the epilogue.
 * NOTE(review): 0x5F80 differs from the power-on default 0x1F80 only in the
 * rounding-control field - confirm the intended rounding mode against the
 * Intel SDM before changing it.
 */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
/* Build the starting state by xoring pairs of context words; ebp is the loop counter. */
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
/* rbx = base address for all masked accesses. */
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
/* xmm6..xmm8 are used below, so their incoming values are saved first. */
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
/* xmm8 stays zero; it is copied into xmm1 before each sqrtsd. */
xorps xmm8, xmm8
/* xmm7 = 1023 << 52 (the shift discards whatever was left in the upper bits of rax). */
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_bulldozer:
/* Load the current block and form the round key xmm6 = {low: r8, high: r11}. */
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
/* ecx / eax / r10d = index xor 16 / 32 / 48: the three neighbouring 16-byte blocks. */
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
/* Add xmm3 / xmm6 / xmm4 to the three neighbours and write them back rotated by one slot. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
/* Clear xmm1 ahead of sqrtsd, which only writes the low qword. */
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
/* rdi = 1023 << 52, added to the sqrt input further down. */
mov edi, 1023
shl rdi, 52
/* r14 = low qword, rax = high qword of the AES result. */
movq r14, xmm5
pextrq rax, xmm5, 1
/* Store (result xor xmm3) back to the first block. */
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
/* r12 = address of the second block; rsi picks up its low qword, r13 keeps its high qword. */
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
/* Divisor: 32-bit sum with bits 31 and 0 forced on (-2147483647 = 0x80000001), so it is never zero. */
add r9d, r14d
or r9d, -2147483647
/* rdx:rax / r9 with rdx = 0; r15 = (low 32 bits of quotient) + (remainder << 32). */
xor edx, edx
div r9
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
/* sqrt input: ((r14 + r15) >> 12) + (1023 << 52), interpreted as a double. */
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
/* If the low 19 bits (524287 = 0x7FFFF) of the result are all zero take the fixup path, else rdi >>= 19. */
movq rdi, xmm1
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
/* 64x64 multiply rsi * r14: rdx = high half, rax = low half; xmm0 = {low: rdx, high: rax}. */
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
/* Second neighbour shuffle, this time around the second block's index. */
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
/* Shift the history: xmm4 takes the older result here, xmm3 takes the newest one below. */
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
/* Epilogue: restore MXCSR, the saved xmm registers and the parked general-purpose registers. */
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
/* r11 = rsp after releasing the 64-byte local area; the parked registers sit above the five pushes. */
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
/* Skip over the out-of-line fixup code that follows. */
jmp cnv2_main_loop_bulldozer_endp
/* Out-of-line path taken when the low 19 bits of the sqrtsd result are all zero. */
sqrt_fixup_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
/* The borrow from this subtraction decides whether rdi is bumped by one. */
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -0,0 +1,70 @@
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
mov rax, QWORD PTR [rcx+48]
mov ebp, 262144
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
mov r15, QWORD PTR [rcx+264]
and edx, 2097136
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
movdqu xmm2, XMMWORD PTR [rdx+rsi]
ALIGN 64
cn_fast_mainloop_sandybridge:
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
movq r10, xmm2
mov r9d, r10d
and r9d, 2097136
add r9, rsi
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
mul r10
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
xor r8, rbx
mov rdx, r8
and edx, 2097136
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cn_fast_mainloop_sandybridge
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -0,0 +1,162 @@
; Main-loop body of the cn_fast variant that performs the AES step with table
; lookups instead of AESENC. This fragment carries no PROC/ret of its own; it
; is meant to be INCLUDEd inside a procedure that supplies them.
;
; In:  rcx = pointer to a context block. Fields are read at offsets 0..56,
;            192, 224, 256, 264 and 272.
;            NOTE(review): field meanings below are inferred from how they are
;            used here - confirm against the C++ caller.
; Effect: the buffer whose base is stored at [rcx+224] is updated in place.
;
; Because the lookups need many scratch registers, several values are parked
; in XMM registers and one stack slot:
;   xmm4 = previous AES result        xmm5 = 64-bit value XORed into high stores
;   xmm6 = context pointer (rcx)      xmm7 = byte lookup table base ([rcx+264])
;   xmm8 = buffer base ([rcx+224])    xmm9 = remaining iterations
;   [rsp+64] = current buffer offset (masked with 2097136 = 0x1FFFF0)
;   r8/r13 = running 128-bit state, low/high qword

; Prologue: save the non-volatile GPRs, then reserve 72 bytes: 64 for
; xmm6-xmm9 and 8 for the offset slot at [rsp+64].
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
; Build the initial state from XORs of context qwords.
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
mov rdi, QWORD PTR [rcx+224]
and edx, 2097136
; xmm5 = qword at byte offset 35 of the buffer pointed to by [rcx+256],
; XORed with [rcx+192].
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
mov eax, 262144
ALIGN 64
cn_fast_mainloop_soft_aes_sandybridge:
; Park the iteration counter; rax is used as scratch below.
movq xmm9, rax
; r12 = lookup-table base ([rcx+272]); fetch the four dwords of the current block.
mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
; Same value rdx already holds (it was stored to this slot just before the
; loop / at the end of the previous iteration).
mov rdx, QWORD PTR [rsp+64]
; Table-driven round (presumably one AES round): successive bytes of the four
; dwords index four 256-entry dword tables at r12+0, +1024, +2048 and +3072.
; First table (offset 0), byte 0 of each dword.
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
; Second table (offset 1024), next byte of each dword.
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
; Advance r12 by 2048: plain indices now hit the third table, and indices
; biased by +256 hit the fourth (256 * 4 = 1024 further on).
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
; rdi was used as scratch above; restore the buffer base.
movq rdi, xmm8
movzx ecx, r14b
; Pack the four result dwords into xmm3 and XOR in the state (r8 low, r13 high).
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
movd xmm3, eax
; rax = byte lookup table base, needed for the byte-11 substitution below.
movq rax, xmm7
punpckldq xmm3, xmm0
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
; r10 = offset of the second block, selected by the low bits of the result.
movq r9, xmm3
mov r10d, r9d
and r10d, 2097136
; Write back (result XOR previous result) to the first block.
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
; Replace byte 11 of the block just written with table[rax + that byte].
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
; 64x64 -> 128 multiply of the second block's low qword by the result's low
; qword. mul leaves the high half in rdx and the low half in rax.
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
movdqa xmm4, xmm3
mul rcx
; Restore the context pointer for the next iteration's table-base load.
movq rcx, xmm6
add r8, rdx
add r13, rax
; The high qword is stored XORed with xmm5; r13 itself is not modified by that.
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
; Recover the counter, derive the next offset and spill it to its slot.
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 2097136
mov QWORD PTR [rsp+64], rdx
sub eax, 1
; Branch to the label defined in this fragment. Code labels inside a MASM PROC
; are local to that procedure, so no other procedure's loop label is reachable.
jne cn_fast_mainloop_soft_aes_sandybridge
; Epilogue: mirror of the prologue.
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,12 +1,15 @@
_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE _TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_mainloop_sandybridge_asm PUBLIC cnv1_mainloop_sandybridge_asm
PUBLIC cn_litev1_mainloop_sandybridge_asm PUBLIC cn_litev1_mainloop_sandybridge_asm
PUBLIC cn_fast_mainloop_sandybridge_asm
PUBLIC cnv2_mainloop_ivybridge_asm PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
ALIGN 64 ALIGN 64
@ -21,6 +24,12 @@ cn_litev1_mainloop_sandybridge_asm PROC
ret 0 ret 0
cn_litev1_mainloop_sandybridge_asm ENDP cn_litev1_mainloop_sandybridge_asm ENDP
ALIGN 64
; Procedure whose body is the contents of cn_fast_mainloop_sandybridge.inc,
; followed by a plain return. The include supplies its own prologue/epilogue.
cn_fast_mainloop_sandybridge_asm PROC
INCLUDE cn_fast_mainloop_sandybridge.inc
ret 0
cn_fast_mainloop_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_ivybridge_asm PROC cnv2_mainloop_ivybridge_asm PROC
INCLUDE cnv2_main_loop_ivybridge.inc INCLUDE cnv2_main_loop_ivybridge.inc
@ -33,6 +42,12 @@ cnv2_mainloop_ryzen_asm PROC
ret 0 ret 0
cnv2_mainloop_ryzen_asm ENDP cnv2_mainloop_ryzen_asm ENDP
ALIGN 64
; Procedure whose body is the contents of cnv2_main_loop_bulldozer.inc,
; followed by a plain return. The include supplies its own prologue/epilogue.
cnv2_mainloop_bulldozer_asm PROC
INCLUDE cnv2_main_loop_bulldozer.inc
ret 0
cnv2_mainloop_bulldozer_asm ENDP
ALIGN 64 ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC cnv2_double_mainloop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc INCLUDE cnv2_double_main_loop_sandybridge.inc
@ -51,6 +66,12 @@ cn_litev1_mainloop_soft_aes_sandybridge_asm PROC
ret 0 ret 0
cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP
ALIGN 64
; Procedure whose body is the contents of cn_fast_mainloop_soft_aes_sandybridge.inc,
; followed by a plain return. The include supplies its own prologue/epilogue.
cn_fast_mainloop_soft_aes_sandybridge_asm PROC
INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc
ret 0
cn_fast_mainloop_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_soft_aes_sandybridge_asm PROC cnv2_mainloop_soft_aes_sandybridge_asm PROC
INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc

View file

@ -5,12 +5,15 @@
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm) .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
ALIGN 64 ALIGN 64
@ -23,6 +26,11 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
#include "../cn_litev1_mainloop_sandybridge.inc" #include "../cn_litev1_mainloop_sandybridge.inc"
ret 0 ret 0
ALIGN 64
/* Entry point whose body is pulled in from ../cn_fast_mainloop_sandybridge.inc,
   followed by a plain return. */
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
#include "../cn_fast_mainloop_sandybridge.inc"
ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_ivybridge_asm): FN_PREFIX(cnv2_mainloop_ivybridge_asm):
#include "../cnv2_main_loop_ivybridge.inc" #include "../cnv2_main_loop_ivybridge.inc"
@ -33,6 +41,11 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
#include "../cnv2_main_loop_ryzen.inc" #include "../cnv2_main_loop_ryzen.inc"
ret 0 ret 0
ALIGN 64
/* Entry point whose body is pulled in from ../cnv2_main_loop_bulldozer.inc,
   followed by a plain return. */
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
#include "../cnv2_main_loop_bulldozer.inc"
ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
#include "../cnv2_double_main_loop_sandybridge.inc" #include "../cnv2_double_main_loop_sandybridge.inc"
@ -48,6 +61,11 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
#include "../cn_litev1_mainloop_soft_aes_sandybridge.inc" #include "../cn_litev1_mainloop_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64
/* Entry point whose body is pulled in from ../cn_fast_mainloop_soft_aes_sandybridge.inc,
   followed by a plain return. */
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
#include "../cn_fast_mainloop_soft_aes_sandybridge.inc"
ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
#include "../cnv2_mainloop_soft_aes_sandybridge.inc" #include "../cnv2_mainloop_soft_aes_sandybridge.inc"

View file

@ -0,0 +1,180 @@
; Main-loop body of the CNv2 variant tuned for AMD Bulldozer. This fragment
; carries no PROC/ret of its own; it is meant to be INCLUDEd inside a procedure
; that supplies them.
;
; In:  rcx = pointer to a context block. Fields are read at offsets 0..104
;            and 224.
;            NOTE(review): field meanings below are inferred from how they are
;            used here - confirm against the C++ caller.
; Effect: the buffer whose base is stored at [rcx+224] is updated in place.
;
; Register roles inside the loop:
;   rbx    = buffer base ([rcx+224])
;   r8/r11 = running 128-bit state, low/high qword
;   r10    = current offset, masked with 2097136 (0x1FFFF0)
;   xmm3   = previous AES result, xmm4 = the one before that
;   r15    = packed divide result (quotient low dword, remainder high dword)
;   rdi    = square-root result carried to the next iteration
;   ebp    = remaining iterations (524288)

; Prologue: stash rbx/rbp/rsi in the slots above the return address
; (presumably the Win64 home space), push five GPRs, reserve 64 bytes.
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
; Save the caller's MXCSR at [rsp] and load 24448 (0x5F80): all exception
; mask bits set, rounding-control field = 10b (round up per the MXCSR layout).
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
; Build the initial state from XORs of context qwords. r9 keeps the context
; pointer because rcx is overwritten below.
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 524288
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
; Save xmm6-xmm8 in the reserved area before they are used.
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
; xmm8 = 0, copied into xmm1 before each sqrtsd.
xorps xmm8, xmm8
; xmm7 = 1023 << 52. Only the low 12 bits of ax survive the shift, so stale
; upper bits of rax do not matter. The loop below rebuilds the same constant
; in rdi and does not read xmm7.
mov ax, 1023
shl rax, 52
movd xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movd xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_bulldozer:
; One AES round on the current block, keyed with the state (r8 low, r11 high).
movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
; r9 = 2 * previous sqrt result; rdi = previous sqrt result << 32.
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
; Offsets of the three sibling 16-byte chunks (offset XOR 16 / 32 / 48).
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
; Load the three chunks, add xmm3 / xmm6 / xmm4 to them, and store each sum
; into a different chunk than it was loaded from.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
; rsi = previous divide result XOR (previous sqrt result << 32).
mov rsi, r15
xor rsi, rdi
; rdi = 1023 << 52, added to the sqrt input below.
mov edi, 1023
shl rdi, 52
; r14 = low qword of the AES result, rax = high qword (the dividend).
movd r14, xmm5
pextrq rax, xmm5, 1
; Write back (AES result XOR previous result) to the first block.
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
; Second block: fold its low qword into rsi, keep its address in r12 and its
; high qword in r13.
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
; Divisor = (r14d + 2 * previous sqrt) | 0x80000001: a non-zero, odd value
; with bit 31 set. The 32-bit OR also clears the upper half of r9.
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
; r15 = quotient (low 32 bits) | remainder << 32.
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
; sqrt input = ((r14 + r15) >> 12) + (1023 << 52), reinterpreted as a double.
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movd xmm0, rax
sqrtsd xmm1, xmm0
movd rdi, xmm1
; If the low 19 bits of the result are all zero, take the out-of-line
; correction path; otherwise rdi = result bits >> 19.
test rdi, 524287
je sqrt_fixup_bulldozer
shr rdi, 19
sqrt_fixup_bulldozer_ret:
; 64x64 -> 128 multiply: rdx:rax = rsi * r14. xmm0 = (high, low).
mov rax, rsi
mul r14
movd xmm1, rax
movd xmm0, rdx
punpcklqdq xmm0, xmm1
; Second shuffle, on the sibling chunks of the second block; the product is
; additionally XORed into one chunk and one chunk is XORed into the product.
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
; Age the history: xmm4 <- xmm3 here, xmm3 <- xmm5 just before the branch.
movdqa xmm4, xmm3
; Add the product to the state, store it to the second block, then fold the
; block's previous contents back in and derive the next offset.
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_bulldozer
; Epilogue: restore the caller's MXCSR and the saved XMM/GPR registers.
; r11 = rsp + 64 is the address of the first pushed register; the slots
; written at entry [rsp+16..32] are 56..72 bytes above it.
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
; Skip over the out-of-line fixup code to the end of the fragment.
jmp cnv2_main_loop_bulldozer_endp
sqrt_fixup_bulldozer:
; Out-of-line correction for the case where the low 19 bits of the sqrtsd
; result are zero. r9 = sqrt input before the shift (AES low qword + r15).
; rdi is decremented and shifted, a product of two derived terms is compared
; against r9, and the borrow from that subtraction is added back into rdi.
; NOTE(review): presumably an exact-integer check that compensates for
; rounding at the boundary - confirm against the reference implementation.
movd r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_bulldozer_ret
cnv2_main_loop_bulldozer_endp:

View file

@ -4,9 +4,9 @@
"threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count)
"multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks)
"multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads)
"pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy (xao), xtl (including autodetect for v5), msr, xhv, rto
// for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations
"asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, off "asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, bulldozer, off
"background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead) "background": false, // true to run the miner in the background (Windows only, for *nix please use screen/tmux or systemd service instead)
"colors": true, // false to disable colored output "colors": true, // false to disable colored output
"cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1