diff --git a/src/AsmOptimization.h b/src/AsmOptimization.h
index 2c441422..ec41381a 100644
--- a/src/AsmOptimization.h
+++ b/src/AsmOptimization.h
@@ -26,6 +26,7 @@ enum AsmOptimization
     ASM_AUTODETECT,
     ASM_INTEL,
     ASM_RYZEN,
+    ASM_BULLDOZER,
     ASM_OFF
 };
 
@@ -37,6 +38,8 @@ inline std::string getAsmOptimizationName(AsmOptimization asmOptimization)
         return "INTEL";
     case ASM_RYZEN:
         return "RYZEN";
+    case ASM_BULLDOZER:
+        return "BULLDOZER";
     case ASM_OFF:
         return "OFF";
     case ASM_AUTODETECT:
@@ -62,7 +65,11 @@ inline AsmOptimization parseAsmOptimization(int optimization)
     case 2:
         asmOptimization = AsmOptimization::ASM_RYZEN;
         break;
+    case 3:
+        asmOptimization = AsmOptimization::ASM_BULLDOZER;
+        break;
     default:
+        asmOptimization = AsmOptimization::ASM_AUTODETECT;
         break;
     }
 
@@ -79,6 +86,8 @@ inline AsmOptimization parseAsmOptimization(const std::string optimization)
         asmOptimization = AsmOptimization::ASM_INTEL;
     } else if (optimization == "2" || optimization == "ryzen") {
         asmOptimization = AsmOptimization::ASM_RYZEN;
+    } else if (optimization == "3" || optimization == "bulldozer") {
+        asmOptimization = AsmOptimization::ASM_BULLDOZER;
     }
 
     return asmOptimization;
diff --git a/src/Cpu_cpuid.cpp b/src/Cpu_cpuid.cpp
index b701a994..dfca80f1 100644
--- a/src/Cpu_cpuid.cpp
+++ b/src/Cpu_cpuid.cpp
@@ -82,8 +82,12 @@ void CpuImpl::initCommon()
     }
 
 #   ifndef XMRIG_NO_ASM
-    if (data.vendor == VENDOR_AMD && data.ext_family >= 0x17) {
-        m_asmOptimization = AsmOptimization::ASM_RYZEN;
+    if (data.vendor == VENDOR_AMD) {
+        if (data.ext_family >= 0x17) {
+            m_asmOptimization = AsmOptimization::ASM_RYZEN;
+        } else if (data.ext_family >= 0x15) {
+            m_asmOptimization = AsmOptimization::ASM_BULLDOZER;
+        }
     } else if (data.vendor == VENDOR_INTEL &&
                ((data.ext_family >= 0x06 && data.ext_model > 0x2) ||
                 (data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) {
diff --git a/src/Options.cpp b/src/Options.cpp
index c73ddda9..3be979ec 100644
--- a/src/Options.cpp
+++ b/src/Options.cpp
@@ -73,9 +73,9 @@ Options:\n"
   -k, --keepalive           send keepalived for prevent timeout (need pool support)\n\
   -r, --retries=N           number of times to retry before switch to backup server (default: 5)\n\
   -R, --retry-pause=N       time to pause between retries (default: 5)\n\
-      --pow-variant=V       specificy the PoW variat to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'alloy', 'xtl' (including autodetect for v5)\n\
+      --pow-variant=V       specify the PoW variant to use: -> 'auto' (default), '0' (v0), '1' (v1, aka cnv7), '2' (v2, aka cnv8), 'ipbc' (tube), 'xao', 'xtl' (including autodetect for v5)\n\
                             for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\
-      --asm-optimization=V  specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'off' \n\
+      --asm-optimization=V  specify the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'bulldozer', 'off' \n\
      --multihash-factor=N   number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\
      --multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\
      --cpu-affinity         set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\
@@ -287,7 +287,7 @@ constexpr static const char *pow_variant_names[] = {
     "1",
     "2",
     "tube",
-    "alloy",
+    "xao",
     "xtl",
     "msr",
     "xhv",
@@ -298,6 +298,7 @@ constexpr static const char *asm_optimization_names[] = {
     "auto",
     "intel",
     "ryzen",
+    "bulldozer",
     "off"
 };
 
@@ -1049,6 +1050,11 @@ bool Options::parsePowVariant(const char *powVariant)
             break;
         }
 
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "xao") || !strcmp(powVariant, "alloy"))) {
+            m_powVariant = POW_ALLOY;
+            break;
+        }
+
         if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || !strcmp(powVariant, "bittube"))) {
             m_powVariant = POW_TUBE;
             break;
diff --git a/src/PowVariant.h b/src/PowVariant.h
index 0bde83d6..8a01bee8 100644
--- a/src/PowVariant.h
+++ b/src/PowVariant.h
@@ -50,7 +50,7 @@ inline std::string getPowVariantName(PowVariant powVariant)
     case POW_TUBE:
         return "tube";
     case POW_ALLOY:
-        return "alloy";
+        return "xao";
     case POW_XTL:
         return "xtl";
     case POW_MSR:
diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp
index e7589738..a92ee8a6 100644
--- a/src/crypto/CryptoNight.cpp
+++ b/src/crypto/CryptoNight.cpp
@@ -50,14 +50,16 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
 #if defined(XMRIG_ARM)
         CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
 #else
-        if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1)) {
+        if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) ||
+            (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1) ||
+            (asmOptimization == AsmOptimization::ASM_BULLDOZER && NUM_HASH_BLOCKS == 1)) {
             CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
         } else {
             CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
         }
 #endif
     } else if (powVersion == PowVariant::POW_ALLOY) {
-        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_XTL) {
 #if defined(XMRIG_ARM)
         CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
@@ -69,7 +71,11 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
         }
 #endif
     } else if (powVersion == PowVariant::POW_MSR) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
     } else if (powVersion == PowVariant::POW_RTO) {
         CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     }else {
@@ -113,7 +119,11 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
         }
 #endif
     } else if (powVersion == PowVariant::POW_MSR) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization);
+        } else {
+            CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+        }
     } else if (powVersion == PowVariant::POW_RTO) {
         CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     } else {
@@ -464,25 +474,15 @@ bool CryptoNight::selfTest(int algo)
     cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads);
     result = result && memcmp(output, test_output_xtl, 32) == 0;
 
-    #if MAX_NUM_HASH_BLOCKS > 1
-    cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
-    result = result && memcmp(output, test_output_xtl, 64) == 0;
-    #endif
+    // cnv7 + msr aka cn-fast
 
-    #if MAX_NUM_HASH_BLOCKS > 2
-    cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
-    result = result && memcmp(output, test_output_xtl, 96) == 0;
-    #endif
+    cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_MSR,test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_msr, 32) == 0;
 
-    #if MAX_NUM_HASH_BLOCKS > 3
-    cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
-    result = result && memcmp(output, test_output_xtl, 128) == 0;
-    #endif
+    // cnv7 + alloy
 
-    #if MAX_NUM_HASH_BLOCKS > 4
-    cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads);
-    result = result && memcmp(output, test_output_xtl, 160) == 0;
-    #endif
+    cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_ALLOY,test_input, 76, output, scratchPads);
+    result = result && memcmp(output, test_output_alloy, 32) == 0;
 
     // cn v8 aka cnv2
diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h
index 2952f140..3f98992a 100644
--- a/src/crypto/CryptoNight_test.h
+++ b/src/crypto/CryptoNight_test.h
@@ -99,20 +99,24 @@ const static uint8_t test_output_v2[160] = {
 };
 
 // CN XTL
-const static uint8_t test_output_xtl[160] = {
+const static uint8_t test_output_xtl[32] = {
     0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3,
-    0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1,
-    0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90,
-    0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF,
-    0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3,
-    0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86,
-    0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D,
-    0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF,
-    0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA,
-    0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48
+    0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1
 };
 
+// CN MSR
+const static uint8_t test_output_msr[32] = {
+    0x3C, 0x7A, 0x61, 0x08, 0x4C, 0x5E, 0xB8, 0x65, 0xB4, 0x98, 0xAB, 0x2F, 0x5A, 0x1A, 0xC5, 0x2C,
+    0x49, 0xC1, 0x77, 0xC2, 0xD0, 0x13, 0x34, 0x42, 0xD6, 0x5E, 0xD5, 0x14, 0x33, 0x5C, 0x82, 0xC5
+};
+
+// CN ALLOY
+const static uint8_t test_output_alloy[32] = {
+    0x9A, 0x29, 0xD0, 0xC4, 0xAF, 0xDC, 0x63, 0x9B, 0x65, 0x53, 0xB1, 0xC8, 0x37, 0x35, 0x11, 0x4C,
+    0x5D, 0x77, 0x16, 0x21, 0x42, 0x97, 0x5C, 0xB8, 0x50, 0xC0, 0xA5, 0x1F, 0x64, 0x07, 0xBD, 0x33
+};
+
+// CN-LITE
 const static uint8_t test_output_v0_lite[160] = {
     0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E,
     0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88,
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index eca53c83..387794cd 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -52,10 +52,13 @@ extern "C"
 #ifndef XMRIG_NO_ASM
     void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0);
     void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0);
+    void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0);
     void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
     void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
+    void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
     void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
     void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
+    void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
     void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
     void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
 #endif
@@ -1424,14 +1427,22 @@ public:
 
         if (ITERATIONS == 0x80000) {
             cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
-        } else if (ITERATIONS == 0x40000){
-            cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
+        } else if (ITERATIONS == 0x40000) {
+            if (MASK == 0x1FFFF0) {
+                cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
+            } else {
+                cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
+            }
         }
     } else {
         if (ITERATIONS == 0x80000) {
             cnv1_mainloop_sandybridge_asm(scratchPad[0]);
-        } else if (ITERATIONS == 0x40000){
-            cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
+        } else if (ITERATIONS == 0x40000) {
+            if (MASK == 0x1FFFF0) {
+                cn_fast_mainloop_sandybridge_asm(scratchPad[0]);
+            } else {
+                cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
+            }
         }
     }
 #endif
@@ -1538,6 +1549,8 @@ public:
             }
         } else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
             cnv2_mainloop_ryzen_asm(scratchPad[0]);
+        } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
+            cnv2_mainloop_bulldozer_asm(scratchPad[0]);
         }
 #endif
diff --git a/src/crypto/asm/cn_fast_mainloop_sandybridge.inc b/src/crypto/asm/cn_fast_mainloop_sandybridge.inc
new file mode 100644
index 00000000..d864952b
--- /dev/null
+++ b/src/crypto/asm/cn_fast_mainloop_sandybridge.inc
@@ -0,0 +1,74 @@
+    mov QWORD PTR [rsp+8], rbx
+    mov QWORD PTR [rsp+16], rbp
+    mov QWORD PTR [rsp+24], rsi
+    mov QWORD PTR [rsp+32], rdi
+    push r14
+    push r15
+    mov rax, QWORD PTR [rcx+48]
+    mov ebp, 262144
+    xor rax, QWORD PTR [rcx+16]
+    mov rdx, QWORD PTR [rcx+56]
+    xor rdx, QWORD PTR [rcx+24]
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    movq xmm3, rax
+    mov rax, QWORD PTR [rcx+256]
+    mov rdi, QWORD PTR [rcx+40]
+    movq xmm0, rdx
+    xor rdi, QWORD PTR [rcx+8]
+    mov rdx, r8
+    mov r15, QWORD PTR [rcx+264]
+    and edx, 2097136
+    mov r14, QWORD PTR [rax+35]
+    xor r14, QWORD PTR [rcx+192]
+    mov rsi, QWORD PTR [rcx+224]
+    punpcklqdq xmm3, xmm0
+    movdqu xmm2, XMMWORD PTR [rdx+rsi]
+
+    #ifdef __APPLE__
+    ALIGN 16
+    #else
+    ALIGN 64
+    #endif
+cn_fast_mainloop_sandybridge:
+    movq xmm0, rdi
+    movq xmm1, r8
+    punpcklqdq xmm1, xmm0
+    aesenc xmm2, xmm1
+    movq r10, xmm2
+    mov r9d, r10d
+    and r9d, 2097136
+    add r9, rsi
+    movdqa xmm0, xmm2
+    pxor xmm0, xmm3
+    movdqa xmm3, xmm2
+    movdqu XMMWORD PTR [rdx+rsi], xmm0
+    psrldq xmm0, 11
+    movq rax, xmm0
+    movzx eax, al
+    movzx eax, BYTE PTR [rax+r15]
+    mov BYTE PTR [rsi+rdx+11], al
+    mov rbx, QWORD PTR [r9]
+    mov r11, QWORD PTR [r9+8]
+    mov rax, rbx
+    mul r10
+    add r8, rdx
+    mov QWORD PTR [r9], r8
+    add rdi, rax
+    mov rax, r14
+    xor rax, rdi
+    mov QWORD PTR [r9+8], rax
+    xor r8, rbx
+    mov rdx, r8
+    and edx, 2097136
+    movdqu xmm2, XMMWORD PTR [rdx+rsi]
+    xor rdi, r11
+    dec ebp
+    jne cn_fast_mainloop_sandybridge
+
+    mov rbx, QWORD PTR [rsp+24]
+    mov rbp, QWORD PTR [rsp+32]
+    mov rsi, QWORD PTR [rsp+40]
+    mov rdi, QWORD PTR [rsp+48]
+    pop r15
+    pop r14
diff --git a/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc
new file mode 100644
index 00000000..0ded4f5c
--- /dev/null
+++ b/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc
@@ -0,0 +1,166 @@
+    push rbx
+    push rbp
+    push rsi
+    push rdi
+    push r12
+    push r13
+    push r14
+    push r15
+    sub rsp, 72
+
+    movaps XMMWORD PTR [rsp], xmm6
+    movaps XMMWORD PTR [rsp+16], xmm7
+    movaps XMMWORD PTR [rsp+32], xmm8
+    movaps XMMWORD PTR [rsp+48], xmm9
+
+    mov rax, QWORD PTR [rcx+48]
+    xor rax, QWORD PTR [rcx+16]
+    mov rdx, QWORD PTR [rcx+56]
+    xor rdx, QWORD PTR [rcx+24]
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    movq xmm4, rax
+    mov rax, QWORD PTR [rcx+256]
+    mov r13, QWORD PTR [rcx+40]
+    movq xmm0, rdx
+    xor r13, QWORD PTR [rcx+8]
+    mov rdx, r8
+    mov rdi, QWORD PTR [rcx+224]
+    and edx, 2097136
+    mov rax, QWORD PTR [rax+35]
+    xor rax, QWORD PTR [rcx+192]
+    movq xmm5, rax
+    movq xmm8, rdi
+    punpcklqdq xmm4, xmm0
+    mov QWORD PTR [rsp+64], rdx
+
+    movq xmm6, rcx
+    mov rax, QWORD PTR [rcx+264]
+    movq xmm7, rax
+
+    mov eax, 262144
+
+    #ifdef __APPLE__
+    ALIGN 16
+    #else
+    ALIGN 64
+    #endif
+cn_fast_mainloop_soft_aes_sandybridge:
+    movq xmm9, rax
+    mov r12, QWORD PTR [rcx+272]
+    mov esi, DWORD PTR [rdx+rdi]
+    mov r10d, DWORD PTR [rdx+rdi+4]
+    mov ebp, DWORD PTR [rdx+rdi+12]
+    mov r14d, DWORD PTR [rdx+rdi+8]
+    mov rdx, QWORD PTR [rsp+64]
+    movzx ecx, sil
+    shr esi, 8
+    mov r15d, DWORD PTR [r12+rcx*4]
+    movzx ecx, r10b
+    shr r10d, 8
+    mov edi, DWORD PTR [r12+rcx*4]
+    movzx ecx, r14b
+    shr r14d, 8
+    mov ebx, DWORD PTR [r12+rcx*4]
+    movzx ecx, bpl
+    shr ebp, 8
+    mov r9d, DWORD PTR [r12+rcx*4]
+    movzx ecx, r10b
+    shr r10d, 8
+    xor r15d, DWORD PTR [r12+rcx*4+1024]
+    movzx ecx, r14b
+    shr r14d, 8
+    mov eax, r14d
+    shr eax, 8
+    xor edi, DWORD PTR [r12+rcx*4+1024]
+    add eax, 256
+    movzx ecx, bpl
+    shr ebp, 8
+    xor ebx, DWORD PTR [r12+rcx*4+1024]
+    movzx ecx, sil
+    shr esi, 8
+    xor r9d, DWORD PTR [r12+rcx*4+1024]
+    add r12, 2048
+    movzx ecx, r10b
+    shr r10d, 8
+    add r10d, 256
+    mov r11d, DWORD PTR [r12+rax*4]
+    xor r11d, DWORD PTR [r12+rcx*4]
+    xor r11d, r9d
+    movzx ecx, sil
+    mov r10d, DWORD PTR [r12+r10*4]
+    shr esi, 8
+    add esi, 256
+    xor r10d, DWORD PTR [r12+rcx*4]
+    movzx ecx, bpl
+    xor r10d, ebx
+    shr ebp, 8
+    add ebp, 256
+    movd xmm1, r11d
+    mov r9d, DWORD PTR [r12+rcx*4]
+    xor r9d, DWORD PTR [r12+rsi*4]
+    mov eax, DWORD PTR [r12+rbp*4]
+    xor r9d, edi
+    movq rdi, xmm8
+    movzx ecx, r14b
+    movd xmm0, r10d
+    movd xmm2, r9d
+    punpckldq xmm2, xmm1
+    movq xmm1, r8
+    xor eax, DWORD PTR [r12+rcx*4]
+    xor eax, r15d
+    movd xmm3, eax
+    movq rax, xmm7
+    punpckldq xmm3, xmm0
+    movq xmm0, r13
+    punpcklqdq xmm1, xmm0
+    punpckldq xmm3, xmm2
+    pxor xmm3, xmm1
+    movq r9, xmm3
+    mov r10d, r9d
+    and r10d, 2097136
+    movdqa xmm0, xmm3
+    pxor xmm0, xmm4
+    movdqu XMMWORD PTR [rdx+rdi], xmm0
+    psrldq xmm0, 11
+    movq rcx, xmm0
+    movzx ecx, cl
+    mov cl, BYTE PTR [rcx+rax]
+    mov BYTE PTR [rdi+rdx+11], cl
+    mov rbx, QWORD PTR [r10+rdi]
+    mov rcx, r9
+    lea r9, QWORD PTR [r10+rdi]
+    mov r11, QWORD PTR [r9+8]
+    mov rax, rbx
+    movdqa xmm4, xmm3
+    mul rcx
+    movq rcx, xmm6
+    add r8, rdx
+    add r13, rax
+    movq rax, xmm5
+    xor rax, r13
+    mov QWORD PTR [r9], r8
+    xor r8, rbx
+    mov QWORD PTR [r9+8], rax
+    movq rax, xmm9
+    mov rdx, r8
+    xor r13, r11
+    and edx, 2097136
+    mov QWORD PTR [rsp+64], rdx
+    sub eax, 1
+    jne cn_fast_mainloop_soft_aes_sandybridge
+
+    movaps xmm6, XMMWORD PTR [rsp]
+    movaps xmm7, XMMWORD PTR [rsp+16]
+    movaps xmm8, XMMWORD PTR [rsp+32]
+    movaps xmm9, XMMWORD PTR [rsp+48]
+
+    add rsp, 72
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rdi
+    pop rsi
+    pop rbp
+    pop rbx
diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S
index c8087dcc..ec139a5f 100644
--- a/src/crypto/asm/cn_main_loop.S
+++ b/src/crypto/asm/cn_main_loop.S
@@ -9,12 +9,15 @@
 #endif
 
 .global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
 .global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
+.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
+.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
 .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
 .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
+.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
 
 #ifdef __APPLE__
@@ -41,6 +44,18 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
     add rsp, 48
     ret 0
 
+#ifdef __APPLE__
+ALIGN 16
+#else
+ALIGN 64
+#endif
+FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
+    sub rsp, 48
+    mov rcx, rdi
+    #include "cn_fast_mainloop_sandybridge.inc"
+    add rsp, 48
+    ret 0
+
 #ifdef __APPLE__
 ALIGN 16
 #else
@@ -65,6 +80,18 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
     add rsp, 48
     ret 0
 
+#ifdef __APPLE__
+ALIGN 16
+#else
+ALIGN 64
+#endif
+FN_PREFIX(cnv2_mainloop_bulldozer_asm):
+    sub rsp, 48
+    mov rcx, rdi
+    #include "cnv2_main_loop_bulldozer.inc"
+    add rsp, 48
+    ret 0
+
 #ifdef __APPLE__
 ALIGN 16
 #else
@@ -102,6 +129,18 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
     add rsp, 48
     ret 0
 
+#ifdef __APPLE__
+ALIGN 16
+#else
+ALIGN 64
+#endif
+FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
+    sub rsp, 48
+    mov rcx, rdi
+    #include "cn_fast_mainloop_soft_aes_sandybridge.inc"
+    add rsp, 48
+    ret 0
+
 #ifdef __APPLE__
 ALIGN 16
 #else
diff --git a/src/crypto/asm/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cnv2_main_loop_bulldozer.inc
new file mode 100644
index 00000000..1b2de354
--- /dev/null
+++ b/src/crypto/asm/cnv2_main_loop_bulldozer.inc
@@ -0,0 +1,180 @@
+    mov QWORD PTR [rsp+16], rbx
+    mov QWORD PTR [rsp+24], rbp
+    mov QWORD PTR [rsp+32], rsi
+    push rdi
+    push r12
+    push r13
+    push r14
+    push r15
+    sub rsp, 64
+
+    stmxcsr DWORD PTR [rsp]
+    mov DWORD PTR [rsp+4], 24448
+    ldmxcsr DWORD PTR [rsp+4]
+
+    mov rax, QWORD PTR [rcx+48]
+    mov r9, rcx
+    xor rax, QWORD PTR [rcx+16]
+    mov ebp, 524288
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    mov r11, QWORD PTR [rcx+40]
+    mov r10, r8
+    mov rdx, QWORD PTR [rcx+56]
+    movq xmm3, rax
+    xor rdx, QWORD PTR [rcx+24]
+    xor r11, QWORD PTR [rcx+8]
+    mov rbx, QWORD PTR [rcx+224]
+    mov rax, QWORD PTR [r9+80]
+    xor rax, QWORD PTR [r9+64]
+    movq xmm0, rdx
+    mov rcx, QWORD PTR [rcx+88]
+    xor rcx, QWORD PTR [r9+72]
+    mov rdi, QWORD PTR [r9+104]
+    and r10d, 2097136
+    movaps XMMWORD PTR [rsp+48], xmm6
+    movq xmm4, rax
+    movaps XMMWORD PTR [rsp+32], xmm7
+    movaps XMMWORD PTR [rsp+16], xmm8
+    xorps xmm8, xmm8
+    mov ax, 1023
+    shl rax, 52
+    movq xmm7, rax
+    mov r15, QWORD PTR [r9+96]
+    punpcklqdq xmm3, xmm0
+    movq xmm0, rcx
+    punpcklqdq xmm4, xmm0
+
+    ALIGN 16
+cnv2_main_loop_bulldozer:
+    movdqa xmm5, XMMWORD PTR [r10+rbx]
+    movq xmm6, r8
+    pinsrq xmm6, r11, 1
+    lea rdx, QWORD PTR [r10+rbx]
+    lea r9, QWORD PTR [rdi+rdi]
+    shl rdi, 32
+
+    mov ecx, r10d
+    mov eax, r10d
+    xor ecx, 16
+    xor eax, 32
+    xor r10d, 48
+    aesenc xmm5, xmm6
+    movdqa xmm2, XMMWORD PTR [rcx+rbx]
+    movdqa xmm1, XMMWORD PTR [rax+rbx]
+    movdqa xmm0, XMMWORD PTR [r10+rbx]
+    paddq xmm2, xmm3
+    paddq xmm1, xmm6
+    paddq xmm0, xmm4
+    movdqa XMMWORD PTR [rcx+rbx], xmm0
+    movdqa XMMWORD PTR [rax+rbx], xmm2
+    movdqa XMMWORD PTR [r10+rbx], xmm1
+
+    movaps xmm1, xmm8
+    mov rsi, r15
+    xor rsi, rdi
+
+    mov edi, 1023
+    shl rdi, 52
+
+    movq r14, xmm5
+    pextrq rax, xmm5, 1
+
+    movdqa xmm0, xmm5
+    pxor xmm0, xmm3
+    mov r10, r14
+    and r10d, 2097136
+    movdqa XMMWORD PTR [rdx], xmm0
+    xor rsi, QWORD PTR [r10+rbx]
+    lea r12, QWORD PTR [r10+rbx]
+    mov r13, QWORD PTR [r10+rbx+8]
+
+    add r9d, r14d
+    or r9d, -2147483647
+    xor edx, edx
+    div r9
+    mov eax, eax
+    shl rdx, 32
+    lea r15, [rax+rdx]
+    lea rax, [r14+r15]
+    shr rax, 12
+    add rax, rdi
+    movq xmm0, rax
+    sqrtsd xmm1, xmm0
+    movq rdi, xmm1
+    test rdi, 524287
+    je sqrt_fixup_bulldozer
+    shr rdi, 19
+
+sqrt_fixup_bulldozer_ret:
+    mov rax, rsi
+    mul r14
+    movq xmm1, rax
+    movq xmm0, rdx
+    punpcklqdq xmm0, xmm1
+
+    mov r9d, r10d
+    mov ecx, r10d
+    xor r9d, 16
+    xor ecx, 32
+    xor r10d, 48
+    movdqa xmm1, XMMWORD PTR [rcx+rbx]
+    xor rdx, [rcx+rbx]
+    xor rax, [rcx+rbx+8]
+    movdqa xmm2, XMMWORD PTR [r9+rbx]
+    pxor xmm2, xmm0
+    paddq xmm4, XMMWORD PTR [r10+rbx]
+    paddq xmm2, xmm3
+    paddq xmm1, xmm6
+    movdqa XMMWORD PTR [r9+rbx], xmm4
+    movdqa XMMWORD PTR [rcx+rbx], xmm2
+    movdqa XMMWORD PTR [r10+rbx], xmm1
+
+    movdqa xmm4, xmm3
+    add r8, rdx
+    add r11, rax
+    mov QWORD PTR [r12], r8
+    xor r8, rsi
+    mov QWORD PTR [r12+8], r11
+    mov r10, r8
+    xor r11, r13
+    and r10d, 2097136
+    movdqa xmm3, xmm5
+    dec ebp
+    jne cnv2_main_loop_bulldozer
+
+    ldmxcsr DWORD PTR [rsp]
+    movaps xmm6, XMMWORD PTR [rsp+48]
+    lea r11, QWORD PTR [rsp+64]
+    mov rbx, QWORD PTR [r11+56]
+    mov rbp, QWORD PTR [r11+64]
+    mov rsi, QWORD PTR [r11+72]
+    movaps xmm8, XMMWORD PTR [r11-48]
+    movaps xmm7, XMMWORD PTR [rsp+32]
+    mov rsp, r11
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rdi
+    jmp cnv2_main_loop_bulldozer_endp
+
+sqrt_fixup_bulldozer:
+    movq r9, xmm5
+    add r9, r15
+    dec rdi
+    mov edx, -1022
+    shl rdx, 32
+    mov rax, rdi
+    shr rdi, 19
+    shr rax, 20
+    mov rcx, rdi
+    sub rcx, rax
+    lea rcx, [rcx+rdx+1]
+    add rax, rdx
+    imul rcx, rax
+    sub rcx, r9
+    adc rdi, 0
+    jmp sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:
\ No newline at end of file
diff --git a/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc b/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc
new file mode 100644
index 00000000..7671930a
--- /dev/null
+++ b/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc
@@ -0,0 +1,70 @@
+    mov QWORD PTR [rsp+8], rbx
+    mov QWORD PTR [rsp+16], rbp
+    mov QWORD PTR [rsp+24], rsi
+    mov QWORD PTR [rsp+32], rdi
+    push r14
+    push r15
+    mov rax, QWORD PTR [rcx+48]
+    mov ebp, 262144
+    xor rax, QWORD PTR [rcx+16]
+    mov rdx, QWORD PTR [rcx+56]
+    xor rdx, QWORD PTR [rcx+24]
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    movq xmm3, rax
+    mov rax, QWORD PTR [rcx+256]
+    mov rdi, QWORD PTR [rcx+40]
+    movq xmm0, rdx
+    xor rdi, QWORD PTR [rcx+8]
+    mov rdx, r8
+    mov r15, QWORD PTR [rcx+264]
+    and edx, 2097136
+    mov r14, QWORD PTR [rax+35]
+    xor r14, QWORD PTR [rcx+192]
+    mov rsi, QWORD PTR [rcx+224]
+    punpcklqdq xmm3, xmm0
+    movdqu xmm2, XMMWORD PTR [rdx+rsi]
+
+    ALIGN 64
+cn_fast_mainloop_sandybridge:
+    movq xmm0, rdi
+    movq xmm1, r8
+    punpcklqdq xmm1, xmm0
+    aesenc xmm2, xmm1
+    movq r10, xmm2
+    mov r9d, r10d
+    and r9d, 2097136
+    add r9, rsi
+    movdqa xmm0, xmm2
+    pxor xmm0, xmm3
+    movdqa xmm3, xmm2
+    movdqu XMMWORD PTR [rdx+rsi], xmm0
+    psrldq xmm0, 11
+    movq rax, xmm0
+    movzx eax, al
+    movzx eax, BYTE PTR [rax+r15]
+    mov BYTE PTR [rsi+rdx+11], al
+    mov rbx, QWORD PTR [r9]
+    mov r11, QWORD PTR [r9+8]
+    mov rax, rbx
+    mul r10
+    add r8, rdx
+    mov QWORD PTR [r9], r8
+    add rdi, rax
+    mov rax, r14
+    xor rax, rdi
+    mov QWORD PTR [r9+8], rax
+    xor r8, rbx
+    mov rdx, r8
+    and edx, 2097136
+    movdqu xmm2, XMMWORD PTR [rdx+rsi]
+    xor rdi, r11
+    dec ebp
+    jne cn_fast_mainloop_sandybridge
+
+    mov rbx, QWORD PTR [rsp+24]
+    mov rbp, QWORD PTR [rsp+32]
+    mov rsi, QWORD PTR [rsp+40]
+    mov rdi, QWORD PTR [rsp+48]
+    pop r15
+    pop r14
diff --git a/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc
new file mode 100644
index 00000000..b80fd471
--- /dev/null
+++ b/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc
@@ -0,0 +1,162 @@
+    push rbx
+    push rbp
+    push rsi
+    push rdi
+    push r12
+    push r13
+    push r14
+    push r15
+    sub rsp, 72
+
+    movaps XMMWORD PTR [rsp], xmm6
+    movaps XMMWORD PTR [rsp+16], xmm7
+    movaps XMMWORD PTR [rsp+32], xmm8
+    movaps XMMWORD PTR [rsp+48], xmm9
+
+    mov rax, QWORD PTR [rcx+48]
+    xor rax, QWORD PTR [rcx+16]
+    mov rdx, QWORD PTR [rcx+56]
+    xor rdx, QWORD PTR [rcx+24]
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    movq xmm4, rax
+    mov rax, QWORD PTR [rcx+256]
+    mov r13, QWORD PTR [rcx+40]
+    movq xmm0, rdx
+    xor r13, QWORD PTR [rcx+8]
+    mov rdx, r8
+    mov rdi, QWORD PTR [rcx+224]
+    and edx, 2097136
+    mov rax, QWORD PTR [rax+35]
+    xor rax, QWORD PTR [rcx+192]
+    movq xmm5, rax
+    movq xmm8, rdi
+    punpcklqdq xmm4, xmm0
+    mov QWORD PTR [rsp+64], rdx
+
+    movq xmm6, rcx
+    mov rax, QWORD PTR [rcx+264]
+    movq xmm7, rax
+
+    mov eax, 262144
+
+    ALIGN 64
+cn_fast_mainloop_soft_aes_sandybridge:
+    movq xmm9, rax
+    mov r12, QWORD PTR [rcx+272]
+    mov esi, DWORD PTR [rdx+rdi]
+    mov r10d, DWORD PTR [rdx+rdi+4]
+    mov ebp, DWORD PTR [rdx+rdi+12]
+    mov r14d, DWORD PTR [rdx+rdi+8]
+    mov rdx, QWORD PTR [rsp+64]
+    movzx ecx, sil
+    shr esi, 8
+    mov r15d, DWORD PTR [r12+rcx*4]
+    movzx ecx, r10b
+    shr r10d, 8
+    mov edi, DWORD PTR [r12+rcx*4]
+    movzx ecx, r14b
+    shr r14d, 8
+    mov ebx, DWORD PTR [r12+rcx*4]
+    movzx ecx, bpl
+    shr ebp, 8
+    mov r9d, DWORD PTR [r12+rcx*4]
+    movzx ecx, r10b
+    shr r10d, 8
+    xor r15d, DWORD PTR [r12+rcx*4+1024]
+    movzx ecx, r14b
+    shr r14d, 8
+    mov eax, r14d
+    shr eax, 8
+    xor edi, DWORD PTR [r12+rcx*4+1024]
+    add eax, 256
+    movzx ecx, bpl
+    shr ebp, 8
+    xor ebx, DWORD PTR [r12+rcx*4+1024]
+    movzx ecx, sil
+    shr esi, 8
+    xor r9d, DWORD PTR [r12+rcx*4+1024]
+    add r12, 2048
+    movzx ecx, r10b
+    shr r10d, 8
+    add r10d, 256
+    mov r11d, DWORD PTR [r12+rax*4]
+    xor r11d, DWORD PTR [r12+rcx*4]
+    xor r11d, r9d
+    movzx ecx, sil
+    mov r10d, DWORD PTR [r12+r10*4]
+    shr esi, 8
+    add esi, 256
+    xor r10d, DWORD PTR [r12+rcx*4]
+    movzx ecx, bpl
+    xor r10d, ebx
+    shr ebp, 8
+    add ebp, 256
+    movd xmm1, r11d
+    mov r9d, DWORD PTR [r12+rcx*4]
+    xor r9d, DWORD PTR [r12+rsi*4]
+    mov eax, DWORD PTR [r12+rbp*4]
+    xor r9d, edi
+    movq rdi, xmm8
+    movzx ecx, r14b
+    movd xmm0, r10d
+    movd xmm2, r9d
+    punpckldq xmm2, xmm1
+    movq xmm1, r8
+    xor eax, DWORD PTR [r12+rcx*4]
+    xor eax, r15d
+    movd xmm3, eax
+    movq rax, xmm7
+    punpckldq xmm3, xmm0
+    movq xmm0, r13
+    punpcklqdq xmm1, xmm0
+    punpckldq xmm3, xmm2
+    pxor xmm3, xmm1
+    movq r9, xmm3
+    mov r10d, r9d
+    and r10d, 2097136
+    movdqa xmm0, xmm3
+    pxor xmm0, xmm4
+    movdqu XMMWORD PTR [rdx+rdi], xmm0
+    psrldq xmm0, 11
+    movq rcx, xmm0
+    movzx ecx, cl
+    mov cl, BYTE PTR [rcx+rax]
+    mov BYTE PTR [rdi+rdx+11], cl
+    mov rbx, QWORD PTR [r10+rdi]
+    mov rcx, r9
+    lea r9, QWORD PTR [r10+rdi]
+    mov r11, QWORD PTR [r9+8]
+    mov rax, rbx
+    movdqa xmm4, xmm3
+    mul rcx
+    movq rcx, xmm6
+    add r8, rdx
+    add r13, rax
+    movq rax, xmm5
+    xor rax, r13
+    mov QWORD PTR [r9], r8
+    xor r8, rbx
+    mov QWORD PTR [r9+8], rax
+    movq rax, xmm9
+    mov rdx, r8
+    xor r13, r11
+    and edx, 2097136
+    mov QWORD PTR [rsp+64], rdx
+    sub eax, 1
+    jne cn_fast_mainloop_soft_aes_sandybridge
+
+    movaps xmm6, XMMWORD PTR [rsp]
+    movaps xmm7, XMMWORD PTR [rsp+16]
+    movaps xmm8, XMMWORD PTR [rsp+32]
+    movaps xmm9, XMMWORD PTR [rsp+48]
+
+    add rsp, 72
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rdi
+    pop rsi
+    pop rbp
+    pop rbx
diff --git a/src/crypto/asm/win/cn_main_loop.asm b/src/crypto/asm/win/cn_main_loop.asm
index 18aaba3a..b2b27099 100644
--- a/src/crypto/asm/win/cn_main_loop.asm
+++ b/src/crypto/asm/win/cn_main_loop.asm
@@ -1,12 +1,15 @@
 _TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE
 PUBLIC cnv1_mainloop_sandybridge_asm
 PUBLIC cn_litev1_mainloop_sandybridge_asm
+PUBLIC cn_fast_mainloop_sandybridge_asm
 PUBLIC cnv2_mainloop_ivybridge_asm
 PUBLIC cnv2_mainloop_ryzen_asm
+PUBLIC cnv2_mainloop_bulldozer_asm
 PUBLIC cnv2_double_mainloop_sandybridge_asm
 PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
 PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
+PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
 PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
 
 ALIGN 64
@@ -21,6 +24,12 @@ cn_litev1_mainloop_sandybridge_asm PROC
     ret 0
 cn_litev1_mainloop_sandybridge_asm ENDP
 
+ALIGN 64
+cn_fast_mainloop_sandybridge_asm PROC
+    INCLUDE cn_fast_mainloop_sandybridge.inc
+    ret 0
+cn_fast_mainloop_sandybridge_asm ENDP
+
 ALIGN 64
 cnv2_mainloop_ivybridge_asm PROC
     INCLUDE cnv2_main_loop_ivybridge.inc
@@ -33,6 +42,12 @@ cnv2_mainloop_ryzen_asm PROC
     ret 0
 cnv2_mainloop_ryzen_asm ENDP
 
+ALIGN 64
+cnv2_mainloop_bulldozer_asm PROC
+    INCLUDE cnv2_main_loop_bulldozer.inc
+    ret 0
+cnv2_mainloop_bulldozer_asm ENDP
+
 ALIGN 64
 cnv2_double_mainloop_sandybridge_asm PROC
     INCLUDE cnv2_double_main_loop_sandybridge.inc
@@ -51,6 +66,12 @@ cn_litev1_mainloop_soft_aes_sandybridge_asm PROC
     ret 0
 cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP
 
+ALIGN 64
+cn_fast_mainloop_soft_aes_sandybridge_asm PROC
+    INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc
+    ret 0
+cn_fast_mainloop_soft_aes_sandybridge_asm ENDP
+
 ALIGN 64
 cnv2_mainloop_soft_aes_sandybridge_asm PROC
     INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc
diff --git a/src/crypto/asm/win/cn_main_loop_win_gcc.S b/src/crypto/asm/win/cn_main_loop_win_gcc.S
index b9261409..a550868c 100644
--- a/src/crypto/asm/win/cn_main_loop_win_gcc.S
+++ b/src/crypto/asm/win/cn_main_loop_win_gcc.S
@@ -5,12 +5,15 @@
 
 .global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
 .global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
+.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_ryzen_asm)
+.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
 .global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
 .global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
+.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
 
 ALIGN 64
@@ -23,6 +26,11 @@ FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
     #include "../cn_litev1_mainloop_sandybridge.inc"
     ret 0
 
+ALIGN 64
+FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
+    #include "../cn_fast_mainloop_sandybridge.inc"
+    ret 0
+
 ALIGN 64
 FN_PREFIX(cnv2_mainloop_ivybridge_asm):
     #include "../cnv2_main_loop_ivybridge.inc"
@@ -33,6 +41,11 @@ FN_PREFIX(cnv2_mainloop_ryzen_asm):
     #include "../cnv2_main_loop_ryzen.inc"
     ret 0
 
+ALIGN 64
+FN_PREFIX(cnv2_mainloop_bulldozer_asm):
+    #include "../cnv2_main_loop_bulldozer.inc"
+    ret 0
+
 ALIGN 64
 FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
     #include "../cnv2_double_main_loop_sandybridge.inc"
@@ -48,6 +61,11 @@ FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
     #include "../cn_litev1_mainloop_soft_aes_sandybridge.inc"
     ret 0
 
+ALIGN 64
+FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
+    #include "../cn_fast_mainloop_soft_aes_sandybridge.inc"
+    ret 0
+
 ALIGN 64
 FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
     #include "../cnv2_mainloop_soft_aes_sandybridge.inc"
diff --git a/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc
new file mode 100644
index 00000000..55452cad
--- /dev/null
+++ b/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc
@@ -0,0 +1,180 @@
+    mov QWORD PTR [rsp+16], rbx
+    mov QWORD PTR [rsp+24], rbp
+    mov QWORD PTR [rsp+32], rsi
+    push rdi
+    push r12
+    push r13
+    push r14
+    push r15
+    sub rsp, 64
+
+    stmxcsr DWORD PTR [rsp]
+    mov DWORD PTR [rsp+4], 24448
+    ldmxcsr DWORD PTR [rsp+4]
+
+    mov rax, QWORD PTR [rcx+48]
+    mov r9, rcx
+    xor rax, QWORD PTR [rcx+16]
+    mov ebp, 524288
+    mov r8, QWORD PTR [rcx+32]
+    xor r8, QWORD PTR [rcx]
+    mov r11, QWORD PTR [rcx+40]
+    mov r10, r8
+    mov rdx, QWORD PTR [rcx+56]
+    movd xmm3, rax
+    xor rdx, QWORD PTR [rcx+24]
+    xor r11, QWORD PTR [rcx+8]
+    mov rbx, QWORD PTR [rcx+224]
+    mov rax, QWORD PTR [r9+80]
+    xor rax, QWORD PTR [r9+64]
+    movd xmm0, rdx
+    mov rcx, QWORD PTR [rcx+88]
+    xor rcx, QWORD PTR [r9+72]
+    mov rdi, QWORD PTR [r9+104]
+    and r10d, 2097136
+    movaps XMMWORD PTR [rsp+48], xmm6
+    movd xmm4, rax
+    movaps XMMWORD PTR [rsp+32], xmm7
+    movaps XMMWORD PTR [rsp+16], xmm8
+    xorps xmm8, xmm8
+    mov ax, 1023
+    shl rax, 52
+    movd xmm7, rax
+    mov r15, QWORD PTR [r9+96]
+    punpcklqdq xmm3, xmm0
+    movd xmm0, rcx
+    punpcklqdq xmm4, xmm0
+
+    ALIGN 16
+cnv2_main_loop_bulldozer:
+    movdqa xmm5, XMMWORD PTR [r10+rbx]
+    movd xmm6, r8
+    pinsrq xmm6, r11, 1
+    lea rdx, QWORD PTR [r10+rbx]
+    lea r9, QWORD PTR [rdi+rdi]
+    shl rdi, 32
+
+    mov ecx, r10d
+    mov eax, r10d
+    xor ecx, 16
+    xor eax, 32
+    xor r10d, 48
+    aesenc xmm5, xmm6
+    movdqa xmm2, XMMWORD PTR [rcx+rbx]
+    movdqa xmm1, XMMWORD PTR [rax+rbx]
+    movdqa xmm0, XMMWORD PTR [r10+rbx]
+    paddq xmm2, xmm3
+    paddq xmm1, xmm6
+    paddq xmm0, xmm4
+    movdqa XMMWORD PTR [rcx+rbx], xmm0
+    movdqa XMMWORD PTR [rax+rbx], xmm2
+    movdqa XMMWORD PTR [r10+rbx], xmm1
+
+    movaps xmm1, xmm8
+    mov rsi, r15
+    xor rsi, rdi
+
+    mov edi, 1023
+    shl rdi, 52
+
+    movd r14, xmm5
+    pextrq rax, xmm5, 1
+
+    movdqa xmm0, xmm5
+    pxor xmm0, xmm3
+    mov r10, r14
+    and r10d, 2097136
+    movdqa XMMWORD PTR [rdx], xmm0
+    xor rsi, QWORD PTR [r10+rbx]
+    lea r12, QWORD PTR [r10+rbx]
+    mov r13, QWORD PTR [r10+rbx+8]
+
+    add r9d, r14d
+    or r9d, -2147483647
+    xor edx, edx
+    div r9
+    mov eax, eax
+    shl rdx, 32
+    lea r15, [rax+rdx]
+    lea rax, [r14+r15]
+    shr rax, 12
+    add rax, rdi
+    movd xmm0, rax
+    sqrtsd xmm1, xmm0
+    movd rdi, xmm1
+    test rdi, 524287
+    je sqrt_fixup_bulldozer
+    shr rdi, 19
+
+sqrt_fixup_bulldozer_ret:
+    mov rax, rsi
+    mul r14
+    movd xmm1, rax
+    movd xmm0, rdx
+    punpcklqdq xmm0, xmm1
+
+    mov r9d, r10d
+    mov ecx, r10d
+    xor r9d, 16
+    xor ecx, 32
+    xor r10d, 48
+    movdqa xmm1, XMMWORD PTR [rcx+rbx]
+    xor rdx, [rcx+rbx]
+    xor rax, [rcx+rbx+8]
+    movdqa xmm2, XMMWORD PTR [r9+rbx]
+    pxor xmm2, xmm0
+    paddq xmm4, XMMWORD PTR [r10+rbx]
+    paddq xmm2, xmm3
+    paddq xmm1, xmm6
+    movdqa XMMWORD PTR [r9+rbx], xmm4
+    movdqa XMMWORD PTR [rcx+rbx], xmm2
+    movdqa XMMWORD PTR [r10+rbx], xmm1
+
+    movdqa xmm4, xmm3
+    add r8, rdx
+    add r11, rax
+    mov QWORD PTR [r12], r8
+    xor r8, rsi
+    mov QWORD PTR [r12+8], r11
+    mov r10, r8
+    xor r11, r13
+    and r10d, 2097136
+    movdqa xmm3, xmm5
+    dec ebp
+    jne cnv2_main_loop_bulldozer
+
+    ldmxcsr DWORD PTR [rsp]
+    movaps xmm6, XMMWORD PTR [rsp+48]
+    lea r11, QWORD PTR [rsp+64]
+    mov rbx, QWORD PTR [r11+56]
+    mov rbp, QWORD PTR [r11+64]
+    mov rsi, QWORD PTR [r11+72]
+    movaps xmm8, XMMWORD PTR [r11-48]
+    movaps xmm7, XMMWORD PTR [rsp+32]
+    mov rsp, r11
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rdi
+    jmp cnv2_main_loop_bulldozer_endp
+
+sqrt_fixup_bulldozer:
+    movd r9, xmm5
+    add r9, r15
+    dec rdi
+    mov edx, -1022
+    shl rdx, 32
+    mov rax, rdi
+    shr rdi, 19
+    shr rax, 20
+    mov rcx, rdi
+    sub rcx, rax
+    lea rcx, [rcx+rdx+1]
+    add rax, rdx
+    imul rcx, rax
+    sub rcx, r9
+    adc rdi, 0
+    jmp sqrt_fixup_bulldozer_ret
+
+cnv2_main_loop_bulldozer_endp:
\ No newline at end of file
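
For reviewers, a minimal sketch of how the new `ASM_BULLDOZER` value is meant to flow from CPUID detection to the cnv2 main-loop dispatch. The `detectAsmOptimization` and `selectCnv2Mainloop` helpers below are hypothetical condensations of the `CpuImpl::initCommon()` and `CryptoNight_x86.h` hunks above, not functions from this tree:

```cpp
#include <cstdint>
#include <cstdio>

// Mirrors the enum added in src/AsmOptimization.h.
enum AsmOptimization { ASM_AUTODETECT, ASM_INTEL, ASM_RYZEN, ASM_BULLDOZER, ASM_OFF };

// Hypothetical condensation of the CpuImpl::initCommon() hunk: AMD extended
// family 0x17+ (Zen) keeps the Ryzen main loop, while families 0x15/0x16
// (Bulldozer through Excavator, plus Jaguar) now pick the Bulldozer-tuned loop.
static AsmOptimization detectAsmOptimization(bool isAmd, uint32_t extFamily)
{
    if (isAmd) {
        if (extFamily >= 0x17) { return ASM_RYZEN; }
        if (extFamily >= 0x15) { return ASM_BULLDOZER; }
    }
    return ASM_INTEL; // the Intel model/family checks are elided here
}

// Hypothetical condensation of the single-hash cnv2 dispatch in
// CryptoNight_x86.h: each optimization selects its own mainloop symbol.
static const char* selectCnv2Mainloop(AsmOptimization opt)
{
    switch (opt) {
    case ASM_RYZEN:     return "cnv2_mainloop_ryzen_asm";
    case ASM_BULLDOZER: return "cnv2_mainloop_bulldozer_asm";
    default:            return "cnv2_mainloop_ivybridge_asm";
    }
}

int main()
{
    // An FX-series chip reports extended family 0x15 -> Bulldozer loop.
    AsmOptimization opt = detectAsmOptimization(true, 0x15);
    std::printf("%s\n", selectCnv2Mainloop(opt)); // cnv2_mainloop_bulldozer_asm
    return 0;
}
```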