Add ASM optimizations
- Add ASM optimization for CN-FAST on INTEL - Add ASM optimization for CNV2 on AMD Bulldozer - Alloy is now announced as XAO
This commit is contained in:
parent
f94d00f1d0
commit
e655bdfc5e
16 changed files with 986 additions and 40 deletions
|
@ -26,6 +26,7 @@ enum AsmOptimization
|
|||
ASM_AUTODETECT,
|
||||
ASM_INTEL,
|
||||
ASM_RYZEN,
|
||||
ASM_BULLDOZER,
|
||||
ASM_OFF
|
||||
};
|
||||
|
||||
|
@ -37,6 +38,8 @@ inline std::string getAsmOptimizationName(AsmOptimization asmOptimization)
|
|||
return "INTEL";
|
||||
case ASM_RYZEN:
|
||||
return "RYZEN";
|
||||
case ASM_BULLDOZER:
|
||||
return "BULLDOZER";
|
||||
case ASM_OFF:
|
||||
return "OFF";
|
||||
case ASM_AUTODETECT:
|
||||
|
@ -62,7 +65,11 @@ inline AsmOptimization parseAsmOptimization(int optimization)
|
|||
case 2:
|
||||
asmOptimization = AsmOptimization::ASM_RYZEN;
|
||||
break;
|
||||
case 3:
|
||||
asmOptimization = AsmOptimization::ASM_AUTODETECT;
|
||||
break;
|
||||
default:
|
||||
asmOptimization = AsmOptimization::ASM_AUTODETECT;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -79,6 +86,8 @@ inline AsmOptimization parseAsmOptimization(const std::string optimization)
|
|||
asmOptimization = AsmOptimization::ASM_INTEL;
|
||||
} else if (optimization == "2" || optimization == "ryzen") {
|
||||
asmOptimization = AsmOptimization::ASM_RYZEN;
|
||||
} else if (optimization == "3" || optimization == "bulldozer") {
|
||||
asmOptimization = AsmOptimization::ASM_RYZEN;
|
||||
}
|
||||
|
||||
return asmOptimization;
|
||||
|
|
|
@ -82,8 +82,12 @@ void CpuImpl::initCommon()
|
|||
}
|
||||
|
||||
# ifndef XMRIG_NO_ASM
|
||||
if (data.vendor == VENDOR_AMD && data.ext_family >= 0x17) {
|
||||
if (data.vendor == VENDOR_AMD) {
|
||||
if (data.ext_family >= 0x17) {
|
||||
m_asmOptimization = AsmOptimization::ASM_RYZEN;
|
||||
} else if (data.ext_family >= 0x15) {
|
||||
m_asmOptimization = AsmOptimization::ASM_BULLDOZER;
|
||||
}
|
||||
} else if (data.vendor == VENDOR_INTEL &&
|
||||
((data.ext_family >= 0x06 && data.ext_model > 0x2) ||
|
||||
(data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) {
|
||||
|
|
|
@ -73,9 +73,9 @@ Options:\n"
|
|||
-k, --keepalive send keepalived for prevent timeout (need pool support)\n\
|
||||
-r, --retries=N number of times to retry before switch to backup server (default: 5)\n\
|
||||
-R, --retry-pause=N time to pause between retries (default: 5)\n\
|
||||
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'alloy', 'xtl' (including autodetect for v5)\n\
|
||||
--pow-variant=V specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for v5)\n\
|
||||
for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
|
||||
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'off' \n\
|
||||
--asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
|
||||
--multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
|
||||
--multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\
|
||||
--cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\
|
||||
|
@ -287,7 +287,7 @@ constexpr static const char *pow_variant_names[] = {
|
|||
"1",
|
||||
"2",
|
||||
"tube",
|
||||
"alloy",
|
||||
"xao",
|
||||
"xtl",
|
||||
"msr",
|
||||
"xhv",
|
||||
|
@ -298,6 +298,7 @@ constexpr static const char *asm_optimization_names[] = {
|
|||
"auto",
|
||||
"intel",
|
||||
"ryzen",
|
||||
"bulldozer",
|
||||
"off"
|
||||
};
|
||||
|
||||
|
@ -1049,6 +1050,11 @@ bool Options::parsePowVariant(const char *powVariant)
|
|||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "xao") || !strcmp(powVariant, "alloy"))) {
|
||||
m_powVariant = POW_ALLOY;
|
||||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || !strcmp(powVariant, "bittube"))) {
|
||||
m_powVariant = POW_TUBE;
|
||||
break;
|
||||
|
|
|
@ -50,7 +50,7 @@ inline std::string getPowVariantName(PowVariant powVariant)
|
|||
case POW_TUBE:
|
||||
return "tube";
|
||||
case POW_ALLOY:
|
||||
return "alloy";
|
||||
return "xao";
|
||||
case POW_XTL:
|
||||
return "xtl";
|
||||
case POW_MSR:
|
||||
|
|
|
@ -50,7 +50,9 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
|
|||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1)) {
|
||||
if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
|
||||
(asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
|
||||
(asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
|
@ -69,7 +71,11 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
|
|||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_MSR) {
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
} else if (powVersion == PowVariant::POW_RTO) {
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
|
||||
}else {
|
||||
|
@ -113,7 +119,11 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
|
|||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_MSR) {
|
||||
if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
|
||||
} else {
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
} else if (powVersion == PowVariant::POW_RTO) {
|
||||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
|
||||
} else {
|
||||
|
@ -464,25 +474,15 @@ bool CryptoNight::selfTest(int algo)
|
|||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl, 32) == 0;
|
||||
|
||||
#if MAX_NUM_HASH_BLOCKS > 1
|
||||
cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl, 64) == 0;
|
||||
#endif
|
||||
// cnv7 + msr aka cn-fast
|
||||
|
||||
#if MAX_NUM_HASH_BLOCKS > 2
|
||||
cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl, 96) == 0;
|
||||
#endif
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_MSR,test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_msr, 32) == 0;
|
||||
|
||||
#if MAX_NUM_HASH_BLOCKS > 3
|
||||
cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl, 128) == 0;
|
||||
#endif
|
||||
// cnv7 + alloy
|
||||
|
||||
#if MAX_NUM_HASH_BLOCKS > 4
|
||||
cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl, 160) == 0;
|
||||
#endif
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_alloy, 32) == 0;
|
||||
|
||||
// cn v8 aka cnv2
|
||||
|
||||
|
|
|
@ -99,20 +99,24 @@ const static uint8_t test_output_v2[160] = {
|
|||
};
|
||||
|
||||
// CN XTL
|
||||
const static uint8_t test_output_xtl[160] = {
|
||||
const static uint8_t test_output_xtl[32] = {
|
||||
0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3,
|
||||
0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1,
|
||||
0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90,
|
||||
0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF,
|
||||
0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3,
|
||||
0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86,
|
||||
0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D,
|
||||
0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF,
|
||||
0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA,
|
||||
0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48
|
||||
0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1
|
||||
};
|
||||
|
||||
// CN MSR
|
||||
const static uint8_t test_output_msr[32] = {
|
||||
0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
|
||||
0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5
|
||||
};
|
||||
|
||||
// CN ALLOY
|
||||
const static uint8_t test_output_alloy[32] = {
|
||||
0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
|
||||
0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33
|
||||
};
|
||||
|
||||
// CN-LITE
|
||||
const static uint8_t test_output_v0_lite[160] = {
|
||||
0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
|
||||
0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
|
||||
|
|
|
@ -52,10 +52,13 @@ extern "C"
|
|||
#ifndef XMRIG_NO_ASM
|
||||
void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
#endif
|
||||
|
@ -1425,15 +1428,23 @@ public:
|
|||
if (ITERATIONS == 0x80000) {
|
||||
cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
if (MASK == 0x1FFFF0) {
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (ITERATIONS == 0x80000) {
|
||||
cnv1_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
if (MASK == 0x1FFFF0) {
|
||||
cn_fast_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
|
||||
|
@ -1538,6 +1549,8 @@ public:
|
|||
}
|
||||
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
||||
cnv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
||||
cnv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
74
src/crypto/asm/cn_fast_mainloop_sandybridge.inc
Normal file
74
src/crypto/asm/cn_fast_mainloop_sandybridge.inc
Normal file
|
@ -0,0 +1,74 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 262144
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_fast_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cnv1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
166
src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc
Normal file
166
src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,166 @@
|
|||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_fast_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 2097136
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
mul rcx
|
||||
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 2097136
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cnv1_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -9,12 +9,15 @@
|
|||
#endif
|
||||
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
|
||||
#ifdef __APPLE__
|
||||
|
@ -41,6 +44,18 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
|
|||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fast_mainloop_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
|
@ -65,6 +80,18 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
|||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
|
@ -102,6 +129,18 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
|
|||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fast_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
|
|
180
src/crypto/asm/cnv2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/cnv2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
|
||||
sqrt_fixup_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
70
src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc
Normal file
70
src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc
Normal file
|
@ -0,0 +1,70 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 262144
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cnv1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
162
src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc
Normal file
162
src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc
Normal file
|
@ -0,0 +1,162 @@
|
|||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 262144
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 2097136
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
mul rcx
|
||||
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 2097136
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cnv1_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,12 +1,15 @@
|
|||
_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv1_mainloop_sandybridge_asm
|
||||
PUBLIC cn_litev1_mainloop_sandybridge_asm
|
||||
PUBLIC cn_fast_mainloop_sandybridge_asm
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
|
||||
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
|
||||
|
||||
ALIGN 64
|
||||
|
@ -21,6 +24,12 @@ cn_litev1_mainloop_sandybridge_asm PROC
|
|||
ret 0
|
||||
cn_litev1_mainloop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_fast_mainloop_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast_mainloop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_ivybridge.inc
|
||||
|
@ -33,6 +42,12 @@ cnv2_mainloop_ryzen_asm PROC
|
|||
ret 0
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cnv2_double_main_loop_sandybridge.inc
|
||||
|
@ -51,6 +66,12 @@ cn_litev1_mainloop_soft_aes_sandybridge_asm PROC
|
|||
ret 0
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc
|
||||
|
|
|
@ -5,12 +5,15 @@
|
|||
|
||||
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
|
||||
ALIGN 64
|
||||
|
@ -23,6 +26,11 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
|
|||
#include "../cn_litev1_mainloop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
|
||||
#include "../cn_fast_mainloop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
#include "../cnv2_main_loop_ivybridge.inc"
|
||||
|
@ -33,6 +41,11 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
|||
#include "../cnv2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
#include "../cnv2_main_loop_bulldozer.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||
#include "../cnv2_double_main_loop_sandybridge.inc"
|
||||
|
@ -48,6 +61,11 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
|
|||
#include "../cn_litev1_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_fast_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_mainloop_soft_aes_sandybridge.inc"
|
||||
|
|
180
src/crypto/asm/win/cnv2_main_loop_bulldozer.inc
Normal file
180
src/crypto/asm/win/cnv2_main_loop_bulldozer.inc
Normal file
|
@ -0,0 +1,180 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movd r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movd xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
|
||||
sqrt_fixup_bulldozer:
|
||||
movd r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
Loading…
Add table
Add a link
Reference in a new issue