diff --git a/CMakeLists.txt b/CMakeLists.txt index 11810617..6a435e18 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -219,24 +219,7 @@ else() endif(WITH_CC_SERVER OR WITH_CC_CLIENT) if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) - if (CMAKE_C_COMPILER_ID MATCHES MSVC) - enable_language(ASM_MASM) - set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm") - set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) - else() - enable_language(ASM) - - if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) - set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S") - else() - set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S") - endif() - - set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) - endif() - - add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE}) - set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C) + include(cmake/asm.cmake) else() add_definitions(/DXMRIG_NO_ASM) endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) diff --git a/cmake/asm.cmake b/cmake/asm.cmake new file mode 100644 index 00000000..15a265b5 --- /dev/null +++ b/cmake/asm.cmake @@ -0,0 +1,115 @@ +# CN v1 original +set(ALGO "original") +set(ITERATIONS "524288") #0x80000 +set(MASK "2097136") #0x1FFFF0 + +configure_file("src/crypto/asm/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_sandybridge.inc") +configure_file("src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc") + +# CN v2 ORIGINAL +set(ALGO "originalv2") +set(ITERATIONS "524288") #0x80000 +set(MASK "2097136") #0x1FFFF0 + +configure_file("src/crypto/asm/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/cnv2_main_loop_ivybridge.inc") 
+configure_file("src/crypto/asm/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/cnv2_main_loop_bulldozer.inc") +configure_file("src/crypto/asm/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/cnv2_main_loop_ryzen.inc") +configure_file("src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv2_double_main_loop_sandybridge.inc") +configure_file("src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_ivybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/win/cnv2_main_loop_bulldozer.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/win/cnv2_main_loop_ryzen.inc") +configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc") + +# CN v1 FAST +set(ALGO "fast") +set(ITERATIONS "262144") #0x40000 +set(MASK "2097136") #0x1FFFF0 + +configure_file("src/crypto/asm/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_fast_sandybridge.inc") +configure_file("src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_fast_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_fast_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_fast_soft_aes_sandybridge.inc") + +# CN v2 FAST +set(ALGO "fastv2") +set(ITERATIONS "262144") #0x40000 +set(MASK "2097136") #0x1FFFF0 + +configure_file("src/crypto/asm/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/cnv2_main_loop_fastv2_ivybridge.inc") 
+configure_file("src/crypto/asm/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/cnv2_main_loop_fastv2_bulldozer.inc") +configure_file("src/crypto/asm/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/cnv2_main_loop_fastv2_ryzen.inc") +configure_file("src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv2_double_main_loop_fastv2_sandybridge.inc") +configure_file("src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_fastv2_ivybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/win/cnv2_main_loop_fastv2_bulldozer.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/win/cnv2_main_loop_fastv2_ryzen.inc") +configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_fastv2_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_fastv2_soft_aes_sandybridge.inc") + +# CN LITE + +set(ALGO "lite") +set(ITERATIONS "262144") #0x40000 +set(MASK "1048560") #0xFFFF0 + +configure_file("src/crypto/asm/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_lite_sandybridge.inc") +configure_file("src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_lite_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_lite_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_lite_soft_aes_sandybridge.inc") + +# CN UPX + +set(ALGO "upx") +set(ITERATIONS "131072") #0x20000 +set(MASK "1048560") #0xFFFF0 + +configure_file("src/crypto/asm/cnv1_main_loop_sandybridge.inc.in" 
"src/crypto/asm/cnv1_main_loop_upx_sandybridge.inc") +configure_file("src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv1_main_loop_upx_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_upx_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv1_main_loop_upx_soft_aes_sandybridge.inc") + +# CN V2 ULTRALITE +set(ALGO "ultralite") +set(ITERATIONS "65536") #0x10000 +set(MASK "131056") #0x1FFF0 + +configure_file("src/crypto/asm/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/cnv2_main_loop_ultralite_ivybridge.inc") +configure_file("src/crypto/asm/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/cnv2_main_loop_ultralite_bulldozer.inc") +configure_file("src/crypto/asm/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/cnv2_main_loop_ultralite_ryzen.inc") +configure_file("src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/cnv2_double_main_loop_ultralite_sandybridge.inc") +configure_file("src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc") + +configure_file("src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_ultralite_ivybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in" "src/crypto/asm/win/cnv2_main_loop_ultralite_bulldozer.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/win/cnv2_main_loop_ultralite_ryzen.inc") +configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_ultralite_sandybridge.inc") +configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc") + +if (CMAKE_C_COMPILER_ID MATCHES MSVC) + enable_language(ASM_MASM) + set(XMRIG_ASM_FILE 
"src/crypto/asm/win/cn_main_loop.asm") + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) +else() + enable_language(ASM) + + if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) + set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S") + else() + set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S") + endif() + + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) +endif() + +add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE}) +set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C) \ No newline at end of file diff --git a/src/Options.cpp b/src/Options.cpp index c7d848c2..d4a11fe4 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -328,7 +328,7 @@ constexpr static const char *pow_variant_names[] = { "xhv", "rto", "xfh", - "xtlv9", + "fast2", "upx", "turtle" }; @@ -420,10 +420,7 @@ Options::Options(int argc, char **argv) : { m_pools.push_back(new Url()); - parseConfig(Platform::defaultConfigName()); - int key; - while (true) { key = getopt_long(argc, argv, short_options, options, nullptr); if (key < 0) { @@ -440,6 +437,10 @@ Options::Options(int argc, char **argv) : return; } + if (!m_pools[0]->isValid() && (!m_ccHost || m_ccPort == 0)) { + parseConfig(Platform::defaultConfigName()); + } + #ifdef XMRIG_CC_SERVER if (m_ccPort == 0) { fprintf(stderr, "No CC Server Port supplied. 
Exiting.\n"); @@ -1176,8 +1177,10 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") || !strcmp(powVariant, "half"))) { - m_powVariant = POW_XTL_V9; + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") || + !strcmp(powVariant, "half") || !strcmp(powVariant, "msr2") || + !strcmp(powVariant, "xtlv9"))) { + m_powVariant = POW_FAST_2; break; } @@ -1186,7 +1189,7 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "trtl")) { + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "trtl") || !strcmp(powVariant, "turtlev2") || !strcmp(powVariant, "pico"))) { m_powVariant = POW_TURTLE; break; } diff --git a/src/Options.h b/src/Options.h index 8c3e730a..a02044f6 100644 --- a/src/Options.h +++ b/src/Options.h @@ -29,6 +29,8 @@ #define MAX_NUM_HASH_BLOCKS 5 #endif +#define MAX_BLOB_SIZE 128 + #include #include diff --git a/src/PowVariant.h b/src/PowVariant.h index 3ec3e9e2..4213cece 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -35,7 +35,7 @@ enum PowVariant POW_XHV, POW_RTO, POW_XFH, - POW_XTL_V9, + POW_FAST_2, POW_UPX, POW_TURTLE, LAST_ITEM @@ -65,8 +65,8 @@ inline std::string getPowVariantName(PowVariant powVariant) return "rto"; case POW_XFH: return "xfh"; - case POW_XTL_V9: - return "xtlv9"; + case POW_FAST_2: + return "fast2"; case POW_UPX: return "upx"; case POW_TURTLE: @@ -138,11 +138,11 @@ inline PowVariant parseVariant(const std::string variant) powVariant = PowVariant::POW_RTO; } else if (variant == "xfh" || variant == "freehaven" || variant == "faven") { powVariant = PowVariant::POW_XFH; - } else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2" || variant == "half") { - powVariant = PowVariant::POW_XTL_V9; + } else if (variant == "xtlv9" || variant 
== "stellite_v9" || variant == "xtlv2" || variant == "half" || variant == "msr2" || variant == "fast2") { + powVariant = PowVariant::POW_FAST_2; } else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") { powVariant = PowVariant::POW_UPX; - } else if (variant == "turtle" || variant == "trtl") { + } else if (variant == "turtle" || variant == "trtl" || variant == "pico" || variant == "turtlev2") { powVariant = PowVariant::POW_TURTLE; } diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index e387bf15..dd24f90d 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -70,7 +70,7 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } #endif -} else if (powVersion == PowVariant::POW_XTL_V9) { +} else if (powVersion == PowVariant::POW_FAST_2) { #if defined(XMRIG_ARM) CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); #else @@ -124,7 +124,7 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); } #endif - } else if (powVersion == PowVariant::POW_XTL_V9) { + } else if (powVersion == PowVariant::POW_FAST_2) { #if defined(XMRIG_ARM) CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); #else @@ -237,6 +237,7 @@ static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowV template static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +# if !defined(XMRIG_ARMv7) #if 
defined(XMRIG_ARM) CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); #else @@ -248,6 +249,7 @@ static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVar CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); } #endif +# endif } template @@ -642,7 +644,7 @@ bool CryptoNight::selfTest(int algo) // cnv8 + xtl aka cn-fast2 - cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL_V9, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl_v9, 32) == 0; } diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 9df769fb..b0e31ae6 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -1430,74 +1430,72 @@ public: // single inline static void hashPowV3(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - const uint8_t* l; - uint64_t* h; - uint64_t al; - uint64_t ah; - uint64_t idx; - __m128i bx0; - __m128i bx1; + keccak(input, (int) size, scratchPad[0]->state, 200); - keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + const uint8_t* l0 = scratchPad[0]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - l = scratchPad[0]->memory; - h = reinterpret_cast(scratchPad[0]->state); + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + uint64_t al0 = h0[0] ^h0[4]; + uint64_t ah0 = h0[1] ^h0[5]; - al = h[0] ^ h[4]; - ah = h[1] ^ h[5]; - bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); - bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ 
h[10]); - idx = h[0] ^ h[4]; + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); - uint64_t division_result_xmm0 = h[12]; - uint64_t sqrt_result0 = h[13]; + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t division_result_xmm0 = h0[12]; + uint64_t sqrt_result0 = h0[13]; for (size_t i = 0; i < ITERATIONS; i++) { - const __m128i ax = _mm_set_epi64x(ah, al); + __m128i cx0; + const __m128i ax0 = _mm_set_epi64x(ah0, al0); - __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[idx & MASK], ax); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); } else { - cx = _mm_load_si128((__m128i*) &l[idx & MASK]); - cx = _mm_aesenc_si128(cx, ax); + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); } - SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + + idx0 = EXTRACT64(cx0); uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[idx & MASK])[0]; - ch = ((uint64_t*) &l[idx & MASK])[1]; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - INTEGER_MATH_V2(0, cl, cx) + INTEGER_MATH_V2(0, cl, cx0); - lo = __umul128(idx, cl, &hi); + lo = __umul128(idx0, cl, &hi); - SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); - al += hi; - ah += lo; + al0 += hi; + ah0 += lo; - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah ^= ch; - al ^= cl; - idx = al; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; - bx0 = cx; + bx10 = bx00; + bx00 = cx0; } - cn_implode_scratchpad((__m128i*) l, (__m128i*) h); - keccakf(h, 24); + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + + 
keccakf(h0, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } @@ -2016,9 +2014,9 @@ public: // double inline static void hashPowV3(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { keccak(input, (int) size, scratchPad[0]->state, 200); keccak(input + size, (int) size, scratchPad[1]->state, 200); @@ -2037,9 +2035,9 @@ public: uint64_t ah1 = h1[1] ^h1[5]; __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); - __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); uint64_t idx0 = h0[0] ^h0[4]; @@ -2048,15 +2046,16 @@ public: uint64_t division_result_xmm0 = h0[12]; uint64_t division_result_xmm1 = h1[12]; - uint64_t sqrt_result0 = h0[13]; - uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + const __m128i ax0 = _mm_set_epi64x(ah0, al0); const __m128i ax1 = _mm_set_epi64x(ah1, al1); - __m128i cx0; - __m128i cx1; if (SOFT_AES) { cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); @@ -2064,8 +2063,8 @@ public: cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx0 = _mm_aesenc_si128(cx0, ax0); - cx1 = _mm_aesenc_si128(cx1, ax1); + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) @@ -2077,9 +2076,6 @@ public: idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); - 
_mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); - uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; @@ -2103,6 +2099,7 @@ public: bx10 = bx00; bx00 = cx0; + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index a52a28f4..fb2f1217 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -50,29 +50,34 @@ extern "C" #include "crypto/c_skein.h" #ifndef XMRIG_NO_ASM - void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0); - void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0); - void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0); - void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0); - void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0); - void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0); - void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); - void cn_fastv2_mainloop_ivybridge_asm(ScratchPad* ctx0); - void cn_fastv2_mainloop_ryzen_asm(ScratchPad* ctx0); - void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0); - void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); - void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0); - void cn_ultralitev2_mainloop_ivybridge_asm(ScratchPad* ctx0); - void cn_ultralitev2_mainloop_ryzen_asm(ScratchPad* ctx0); - void cn_ultralitev2_mainloop_bulldozer_asm(ScratchPad* ctx0); - void cn_ultralitev2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); - void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); - 
void cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx); + void cnv1_main_loop_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_lite_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_fast_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_upx_sandybridge_asm(ScratchPad* ctx0); + + void cnv2_main_loop_ivybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_ryzen_asm(ScratchPad* ctx0); + void cnv2_main_loop_bulldozer_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + + void cnv2_main_loop_fastv2_ivybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_fastv2_ryzen_asm(ScratchPad* ctx0); + void cnv2_main_loop_fastv2_bulldozer_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_fastv2_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + + void cnv2_main_loop_ultralite_ivybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_ultralite_ryzen_asm(ScratchPad* ctx0); + void cnv2_main_loop_ultralite_bulldozer_asm(ScratchPad* ctx0); + void cnv2_double_main_loop_ultralite_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + + void cnv1_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_lite_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_fast_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv1_main_loop_upx_soft_aes_sandybridge_asm(ScratchPad* ctx0); + + void cnv2_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(ScratchPad* ctx); #endif } @@ -1437,28 +1442,28 @@ public: if (SOFT_AES) { scratchPad[0]->t_fn = (const uint32_t*)saes_table; - if (ITERATIONS == 0x80000) { - cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); - } else if (ITERATIONS == 0x40000) { + if (ITERATIONS == 0x40000) { if (MASK == 0x1FFFF0) { - cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + 
cnv1_main_loop_fast_soft_aes_sandybridge_asm(scratchPad[0]); } else { - cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + cnv1_main_loop_lite_soft_aes_sandybridge_asm(scratchPad[0]); } + } else if (ITERATIONS == 0x20000) { + cnv1_main_loop_upx_soft_aes_sandybridge_asm(scratchPad[0]); } else { - cn_liteupx_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + cnv1_main_loop_soft_aes_sandybridge_asm(scratchPad[0]); } } else { - if (ITERATIONS == 0x80000) { - cnv1_mainloop_sandybridge_asm(scratchPad[0]); - } else if (ITERATIONS == 0x40000) { + if (ITERATIONS == 0x40000) { if (MASK == 0x1FFFF0) { - cn_fast_mainloop_sandybridge_asm(scratchPad[0]); + cnv1_main_loop_fast_sandybridge_asm(scratchPad[0]); } else { - cn_litev1_mainloop_sandybridge_asm(scratchPad[0]); + cnv1_main_loop_lite_sandybridge_asm(scratchPad[0]); } + } else if (ITERATIONS == 0x20000) { + cnv1_main_loop_upx_sandybridge_asm(scratchPad[0]); } else { - cn_liteupx_mainloop_sandybridge_asm(scratchPad[0]); + cnv1_main_loop_sandybridge_asm(scratchPad[0]); } } #endif @@ -1560,36 +1565,36 @@ public: scratchPad[0]->input = input; scratchPad[0]->t_fn = (const uint32_t*)saes_table; if (ITERATIONS == 0x40000) { - cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(scratchPad[0]); } else if (ITERATIONS == 0x10000) { - cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(scratchPad[0]); } else { - cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + cnv2_main_loop_soft_aes_sandybridge_asm(scratchPad[0]); } } else { - if (ITERATIONS == 0x10000) { - cn_ultralitev2_mainloop_ivybridge_asm(scratchPad[0]); - } else if (ITERATIONS == 0x40000) { - cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]); + if (ITERATIONS == 0x40000) { + cnv2_main_loop_fastv2_ivybridge_asm(scratchPad[0]); + } else if (ITERATIONS == 0x10000) { + cnv2_main_loop_ultralite_ivybridge_asm(scratchPad[0]); } else { - 
cnv2_mainloop_ivybridge_asm(scratchPad[0]); + cnv2_main_loop_ivybridge_asm(scratchPad[0]); } } } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { - if (ITERATIONS == 0x10000) { - cn_ultralitev2_mainloop_ryzen_asm(scratchPad[0]); - } else if (ITERATIONS == 0x40000) { - cn_fastv2_mainloop_ryzen_asm(scratchPad[0]); + if (ITERATIONS == 0x40000) { + cnv2_main_loop_fastv2_ryzen_asm(scratchPad[0]); + } else if (ITERATIONS == 0x10000) { + cnv2_main_loop_ultralite_ryzen_asm(scratchPad[0]); } else { - cnv2_mainloop_ryzen_asm(scratchPad[0]); + cnv2_main_loop_ryzen_asm(scratchPad[0]); } } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { - if (ITERATIONS == 0x10000) { - cn_ultralitev2_mainloop_bulldozer_asm(scratchPad[0]); - } else if (ITERATIONS == 0x40000) { - cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]); + if (ITERATIONS == 0x40000) { + cnv2_main_loop_fastv2_bulldozer_asm(scratchPad[0]); + } else if (ITERATIONS == 0x10000) { + cnv2_main_loop_ultralite_bulldozer_asm(scratchPad[0]); } else { - cnv2_mainloop_bulldozer_asm(scratchPad[0]); + cnv2_main_loop_bulldozer_asm(scratchPad[0]); } } #endif @@ -2306,12 +2311,12 @@ public: cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); #ifndef XMRIG_NO_ASM - if (ITERATIONS == 0x10000) { - cn_ultralitev2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); - } else if (ITERATIONS == 0x40000) { - cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + if (ITERATIONS == 0x40000) { + cnv2_double_main_loop_fastv2_sandybridge_asm(scratchPad[0], scratchPad[1]); + } else if (ITERATIONS == 0x10000) { + cnv2_double_main_loop_ultralite_sandybridge_asm(scratchPad[0], scratchPad[1]); } else { - cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); + cnv2_double_main_loop_sandybridge_asm(scratchPad[0], scratchPad[1]); } #endif diff --git a/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 
index 9b58cdc7..00000000 --- a/src/crypto/asm/cn_fast_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,166 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cn_fast_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, 
r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 2097136 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 2097136 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_fast_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc b/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc deleted file mode 100644 index 8d341665..00000000 --- a/src/crypto/asm/cn_fastv2_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - 
ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_fast2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movq r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movq xmm0, rax - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_fast2_bulldozer - shr 
rdi, 19 - -sqrt_fixup_fast2_bulldozer_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_fast2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_fast2_bulldozer_endp - -sqrt_fixup_fast2_bulldozer: - movq r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_fast2_bulldozer_ret - -cnv2_main_loop_fast2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc b/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc deleted file mode 100644 index e4012d0c..00000000 --- a/src/crypto/asm/cn_fastv2_main_loop_ryzen.inc +++ /dev/null @@ -1,183 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - 
mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -$main_loop_fast2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je 
$sqrt_fixup_fast2_ryzen - shr rdi, 19 - -$sqrt_fixup_fast2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne $main_loop_fast2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp $cnv2_main_loop_fast2_ryzen_endp - -$sqrt_fixup_fast2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp $sqrt_fixup_fast2_ryzen_ret - -$cnv2_main_loop_fast2_ryzen_endp: diff --git a/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 3324137e..00000000 --- a/src/crypto/asm/cn_fastv2_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,271 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor 
rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 2097136 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 262144 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cnv2_mainloop_soft_aes_fast2_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR 
[r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 2097136 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_soft_aes_fast2_sandybridge - psrlq xmm1, 19 -sqrt_fixup_soft_aes_fast2_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu 
xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 2097136 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_mainloop_soft_aes_fast2_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp - -sqrt_fixup_soft_aes_fast2_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret - -cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp: diff --git a/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc b/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc deleted file mode 100644 index 4dae0c33..00000000 --- a/src/crypto/asm/cn_liteupx_mainloop_sandybridge.inc +++ /dev/null @@ -1,74 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, 
QWORD PTR [rcx+48] - mov ebp, 131072 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 1048560 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cn_liteupx_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 1048560 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 1048560 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cn_liteupx_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 880f8b09..00000000 --- a/src/crypto/asm/cn_liteupx_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,166 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD 
PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 131072 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cn_liteupx_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - 
punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_liteupx_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/cn_litev1_mainloop_sandybridge.inc b/src/crypto/asm/cn_litev1_mainloop_sandybridge.inc deleted file mode 100644 index 2842d5fb..00000000 --- a/src/crypto/asm/cn_litev1_mainloop_sandybridge.inc +++ /dev/null @@ -1,74 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 1048560 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, 
QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cn_litev1_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 1048560 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 1048560 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cn_litev1_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/cn_litev1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_litev1_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index e38dcd60..00000000 --- a/src/crypto/asm/cn_litev1_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,166 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - 
punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cn_litev1_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE 
PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_litev1_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S index 3effbbfe..b2f6bc67 100644 --- a/src/crypto/asm/cn_main_loop.S +++ b/src/crypto/asm/cn_main_loop.S @@ -7,40 +7,44 @@ # define FN_PREFIX(fn) fn .section .text #endif -.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm) -.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) -.global FN_PREFIX(cnv2_mainloop_ryzen_asm) -.global FN_PREFIX(cnv2_mainloop_bulldozer_asm) -.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm) -.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm) +.global 
FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm) -.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm) + +.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm) #ifdef __APPLE__ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv1_mainloop_sandybridge_asm): +FN_PREFIX(cnv1_main_loop_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cnv1_mainloop_sandybridge.inc" + #include "cnv1_main_loop_sandybridge.inc" add rsp, 48 ret 0 @@ -49,10 +53,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_litev1_mainloop_sandybridge_asm): 
+FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_litev1_mainloop_sandybridge.inc" + #include "cnv1_main_loop_lite_sandybridge.inc" add rsp, 48 ret 0 @@ -61,10 +65,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fast_mainloop_sandybridge_asm): +FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_fast_mainloop_sandybridge.inc" + #include "cnv1_main_loop_fast_sandybridge.inc" add rsp, 48 ret 0 @@ -73,7 +77,19 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv2_mainloop_ivybridge_asm): +FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv1_main_loop_upx_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_ivybridge_asm): sub rsp, 48 mov rcx, rdi #include "cnv2_main_loop_ivybridge.inc" @@ -85,7 +101,7 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv2_mainloop_ryzen_asm): +FN_PREFIX(cnv2_main_loop_ryzen_asm): sub rsp, 48 mov rcx, rdi #include "cnv2_main_loop_ryzen.inc" @@ -97,7 +113,7 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv2_mainloop_bulldozer_asm): +FN_PREFIX(cnv2_main_loop_bulldozer_asm): sub rsp, 48 mov rcx, rdi #include "cnv2_main_loop_bulldozer.inc" @@ -109,7 +125,7 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): +FN_PREFIX(cnv2_double_main_loop_sandybridge_asm): sub rsp, 48 mov rcx, rdi mov rdx, rsi @@ -122,10 +138,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm): +FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_fastv2_main_loop_ivybridge.inc" + #include "cnv2_main_loop_fastv2_ivybridge.inc" add rsp, 48 ret 0 @@ -134,10 +150,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fastv2_mainloop_ryzen_asm): +FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cn_fastv2_main_loop_ryzen.inc" + #include "cnv2_main_loop_fastv2_ryzen.inc" add rsp, 48 
ret 0 @@ -146,10 +162,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm): +FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm): sub rsp, 48 mov rcx, rdi - #include "cn_fastv2_main_loop_bulldozer.inc" + #include "cnv2_main_loop_fastv2_bulldozer.inc" add rsp, 48 ret 0 @@ -158,11 +174,11 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm): +FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm): sub rsp, 48 mov rcx, rdi mov rdx, rsi - #include "cn_fastv2_double_main_loop_sandybridge.inc" + #include "cnv2_double_main_loop_fastv2_sandybridge.inc" add rsp, 48 ret 0 @@ -171,10 +187,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): +FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_liteupx_mainloop_sandybridge.inc" + #include "cnv2_main_loop_ultralite_ivybridge.inc" add rsp, 48 ret 0 @@ -183,10 +199,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): +FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm): sub rsp, 48 mov rcx, rdi - #include "cn_ultralitev2_main_loop_ivybridge.inc" + #include "cnv2_main_loop_ultralite_ryzen.inc" add rsp, 48 ret 0 @@ -195,11 +211,23 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): +FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ultralite_bulldozer.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm): sub rsp, 48 mov rcx, rdi mov rdx, rsi - #include "cn_ultralitev2_double_main_loop_sandybridge.inc" + #include "cnv2_double_main_loop_ultralite_sandybridge.inc" add rsp, 48 ret 0 @@ -208,10 +236,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): +FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include 
"cn_ultralitev2_main_loop_ryzen.inc" + #include "cnv1_main_loop_soft_aes_sandybridge.inc" add rsp, 48 ret 0 @@ -220,34 +248,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_ultralitev2_main_loop_bulldozer.inc" - add rsp, 48 - ret 0 - -#ifdef __APPLE__ -ALIGN 16 -#else -ALIGN 64 -#endif -FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cnv1_mainloop_soft_aes_sandybridge.inc" - add rsp, 48 - ret 0 - -#ifdef __APPLE__ -ALIGN 16 -#else -ALIGN 64 -#endif -FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm): +FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_litev1_mainloop_soft_aes_sandybridge.inc" + #include "cnv1_main_loop_lite_soft_aes_sandybridge.inc" add rsp, 48 ret 0 @@ -256,10 +260,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm): +FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_fast_mainloop_soft_aes_sandybridge.inc" + #include "cnv1_main_loop_fast_soft_aes_sandybridge.inc" add rsp, 48 ret 0 @@ -268,34 +272,10 @@ ALIGN 16 #else ALIGN 64 #endif -FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cnv2_mainloop_soft_aes_sandybridge.inc" - add rsp, 48 - ret 0 - -#ifdef __APPLE__ -ALIGN 16 -#else -ALIGN 64 -#endif -FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): - sub rsp, 48 - mov rcx, rdi - #include "cn_fastv2_mainloop_soft_aes_sandybridge.inc" - add rsp, 48 - ret 0 - -#ifdef __APPLE__ -ALIGN 16 -#else -ALIGN 64 -#endif -FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): +FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_liteupx_mainloop_soft_aes_sandybridge.inc" + #include "cnv1_main_loop_upx_soft_aes_sandybridge.inc" add rsp, 48 ret 0 @@ -304,9 +284,34 @@ ALIGN 16 #else ALIGN 64 #endif 
-FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm): +FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_fastv2_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm): sub rsp, 48 mov rcx, rdi - #include "cn_ultralitev2_mainloop_soft_aes_sandybridge.inc" + #include "cnv2_main_loop_ultralite_soft_aes_sandybridge.inc" add rsp, 48 ret 0 \ No newline at end of file diff --git a/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc b/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc deleted file mode 100644 index 2cc3c4f9..00000000 --- a/src/crypto/asm/cn_ultralitev2_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,414 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 65536 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR 
[rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 131056 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 131056 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -main_loop_double_ultralitev2_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 131056 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, 
QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 131056 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 131056 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] 
- shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_ultralitev2_sandybridge -div_fix_1_ret_ultralitev2_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_ultralitev2_sandybridge -div_fix_2_ret_ultralitev2_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_ultralitev2_sandybridge -sqrt_fix_1_ret_ultralitev2_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_ultralitev2_sandybridge -sqrt_fix_2_ret_ultralitev2_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 131056 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - 
mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_ultralitev2_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp - -div_fix_1_ultralitev2_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_ultralitev2_sandybridge - -div_fix_2_ultralitev2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_ultralitev2_sandybridge - -sqrt_fix_1_ultralitev2_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_ultralitev2_sandybridge - -sqrt_fix_2_ultralitev2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_ultralitev2_sandybridge - -cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp: diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc b/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc deleted file mode 100644 index 
403cff04..00000000 --- a/src/crypto/asm/cn_ultralitev2_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_ultralitev2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movq r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea 
r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movq xmm0, rax - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_ultralitev2_bulldozer - shr rdi, 19 - -sqrt_fixup_ultralitev2_bulldozer_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_ultralitev2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_ultralitev2_bulldozer_endp - -sqrt_fixup_ultralitev2_bulldozer: - movq r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_ultralitev2_bulldozer_ret - -cnv2_main_loop_ultralitev2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc b/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc deleted file mode 
100644 index b0488836..00000000 --- a/src/crypto/asm/cn_ultralitev2_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 65536 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 131056 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -$main_loop_ultralitev2_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 131056 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, 
QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je $sqrt_fixup_ultralitev2_ivybridge - psrlq xmm3, 19 -$sqrt_fixup_ultralitev2_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 131056 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne $main_loop_ultralitev2_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp $cnv2_main_loop_ultralitev2_ivybridge_endp - -$sqrt_fixup_ultralitev2_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp $sqrt_fixup_ultralitev2_ivybridge_ret - -$cnv2_main_loop_ultralitev2_ivybridge_endp: diff --git a/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc 
b/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc deleted file mode 100644 index e50ff9e9..00000000 --- a/src/crypto/asm/cn_ultralitev2_main_loop_ryzen.inc +++ /dev/null @@ -1,183 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -$main_loop_ultralitev2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and 
r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je $sqrt_fixup_ultralitev2_ryzen - shr rdi, 19 - -$sqrt_fixup_ultralitev2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne $main_loop_ultralitev2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp $cnv2_main_loop_ultralitev2_ryzen_endp - -$sqrt_fixup_ultralitev2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp $sqrt_fixup_ultralitev2_ryzen_ret - -$cnv2_main_loop_ultralitev2_ryzen_endp: diff --git 
a/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 8ab8a060..00000000 --- a/src/crypto/asm/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,271 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 131056 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 262144 - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cnv2_mainloop_soft_aes_ultralitev2_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - 
punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 131056 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR 
[r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_soft_aes_ultralitev2_sandybridge - psrlq xmm1, 19 -sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 131056 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp - -sqrt_fixup_soft_aes_ultralitev2_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, 
[rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret - -cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp: diff --git a/src/crypto/asm/cn_fast_mainloop_sandybridge.inc b/src/crypto/asm/cnv1_main_loop_sandybridge.inc.in similarity index 90% rename from src/crypto/asm/cn_fast_mainloop_sandybridge.inc rename to src/crypto/asm/cnv1_main_loop_sandybridge.inc.in index 8a3bdaf7..6619b23e 100644 --- a/src/crypto/asm/cn_fast_mainloop_sandybridge.inc +++ b/src/crypto/asm/cnv1_main_loop_sandybridge.inc.in @@ -5,7 +5,7 @@ push r14 push r15 mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 + mov ebp, ${ITERATIONS} xor rax, QWORD PTR [rcx+16] mov rdx, QWORD PTR [rcx+56] xor rdx, QWORD PTR [rcx+24] @@ -18,7 +18,7 @@ xor rdi, QWORD PTR [rcx+8] mov rdx, r8 mov r15, QWORD PTR [rcx+264] - and edx, 2097136 + and edx, ${MASK} mov r14, QWORD PTR [rax+35] xor r14, QWORD PTR [rcx+192] mov rsi, QWORD PTR [rcx+224] @@ -30,14 +30,14 @@ #else ALIGN 64 #endif -cn_fast_mainloop_sandybridge: +cnv1_main_loop_${ALGO}_sandybridge: movq xmm0, rdi movq xmm1, r8 punpcklqdq xmm1, xmm0 aesenc xmm2, xmm1 movq r10, xmm2 mov r9d, r10d - and r9d, 2097136 + and r9d, ${MASK} add r9, rsi movdqa xmm0, xmm2 pxor xmm0, xmm3 @@ -60,11 +60,11 @@ cn_fast_mainloop_sandybridge: mov QWORD PTR [r9+8], rax xor r8, rbx mov rdx, r8 - and edx, 2097136 + and edx, ${MASK} movdqu xmm2, XMMWORD PTR [rdx+rsi] xor rdi, r11 dec ebp - jne cn_fast_mainloop_sandybridge + jne cnv1_main_loop_${ALGO}_sandybridge mov rbx, QWORD PTR [rsp+24] mov rbp, QWORD PTR [rsp+32] diff --git a/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in similarity index 94% rename from src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc rename to src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in index 5a28185e..c8d8ea59 100644 --- a/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc +++ 
b/src/crypto/asm/cnv1_main_loop_soft_aes_sandybridge.inc.in @@ -26,7 +26,7 @@ xor r13, QWORD PTR [rcx+8] mov rdx, r8 mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 + and edx, ${MASK} mov rax, QWORD PTR [rax+35] xor rax, QWORD PTR [rcx+192] movq xmm5, rax @@ -38,14 +38,14 @@ mov rax, QWORD PTR [rcx+264] movq xmm7, rax - mov eax, 524288 + mov eax, ${ITERATIONS} #ifdef __APPLE__ ALIGN 16 #else ALIGN 64 #endif -cnv1_mainloop_soft_aes_sandybridge: +cnv1_main_loop_${ALGO}_soft_aes_sandybridge: movq xmm9, rax mov r12, QWORD PTR [rcx+272] mov esi, DWORD PTR [rdx+rdi] @@ -118,7 +118,7 @@ cnv1_mainloop_soft_aes_sandybridge: pxor xmm3, xmm1 movq r9, xmm3 mov r10d, r9d - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm0, xmm3 pxor xmm0, xmm4 movdqu XMMWORD PTR [rdx+rdi], xmm0 @@ -145,10 +145,10 @@ cnv1_mainloop_soft_aes_sandybridge: movq rax, xmm9 mov rdx, r8 xor r13, r11 - and edx, 2097136 + and edx, ${MASK} mov QWORD PTR [rsp+64], rdx sub eax, 1 - jne cnv1_mainloop_soft_aes_sandybridge + jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge movaps xmm6, XMMWORD PTR [rsp] movaps xmm7, XMMWORD PTR [rsp+16] diff --git a/src/crypto/asm/cnv1_mainloop_sandybridge.inc b/src/crypto/asm/cnv1_mainloop_sandybridge.inc deleted file mode 100644 index 89cc15e8..00000000 --- a/src/crypto/asm/cnv1_mainloop_sandybridge.inc +++ /dev/null @@ -1,74 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 524288 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 2097136 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, 
XMMWORD PTR [rdx+rsi] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -cnv1_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 2097136 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 2097136 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc deleted file mode 100644 index 1ea871f3..00000000 --- a/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,414 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 524288 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR 
[rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -main_loop_double_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 
- - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, 
xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_sandybridge -div_fix_1_ret_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_sandybridge -div_fix_2_ret_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_sandybridge -sqrt_fix_1_ret_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_sandybridge -sqrt_fix_2_ret_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu 
xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_sandybridge_endp - -div_fix_1_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_sandybridge - -div_fix_2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_sandybridge - -sqrt_fix_1_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_sandybridge - -sqrt_fix_2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_sandybridge - -cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in similarity index 90% rename from src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc rename to 
src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in index 72ab414d..bdf86ea7 100644 --- a/src/crypto/asm/cn_fastv2_double_main_loop_sandybridge.inc +++ b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc.in @@ -18,7 +18,7 @@ mov r10, QWORD PTR [rcx+32] mov r8, rcx xor r10, QWORD PTR [rcx] - mov r14d, 262144 + mov r14d, ${ITERATIONS} mov r11, QWORD PTR [rcx+40] xor r11, QWORD PTR [rcx+8] mov rsi, QWORD PTR [rdx+224] @@ -99,7 +99,7 @@ #else ALIGN 64 #endif -main_loop_double_fast2_sandybridge: +cnv2_double_main_loop_${ALGO}_sandybridge: movdqu xmm9, xmm15 mov eax, edx mov ebx, edx @@ -253,8 +253,8 @@ main_loop_double_fast2_sandybridge: mov rbx, rax imul rax, rdx sub r11, rax - js div_fix_1_fast2_sandybridge -div_fix_1_ret_fast2_sandybridge: + js div_fix_1_${ALGO}_sandybridge +div_fix_1_ret_${ALGO}_sandybridge: cvttsd2si rdx, xmm2 mov rax, rdx @@ -262,8 +262,8 @@ div_fix_1_ret_fast2_sandybridge: movd xmm2, r11d movd xmm4, ebx sub r8, rax - js div_fix_2_fast2_sandybridge -div_fix_2_ret_fast2_sandybridge: + js div_fix_2_${ALGO}_sandybridge +div_fix_2_ret_${ALGO}_sandybridge: movd xmm1, r8d movd xmm0, edx @@ -279,15 +279,15 @@ div_fix_2_ret_fast2_sandybridge: movdqa xmm5, xmm1 psrlq xmm5, 19 test r9, 524287 - je sqrt_fix_1_fast2_sandybridge -sqrt_fix_1_ret_fast2_sandybridge: + je sqrt_fix_1_${ALGO}_sandybridge +sqrt_fix_1_ret_${ALGO}_sandybridge: movq r9, xmm10 psrldq xmm1, 8 movq r8, xmm1 test r8, 524287 - je sqrt_fix_2_fast2_sandybridge -sqrt_fix_2_ret_fast2_sandybridge: + je sqrt_fix_2_${ALGO}_sandybridge +sqrt_fix_2_ret_${ALGO}_sandybridge: mov r12d, ecx mov r8d, ecx @@ -335,7 +335,7 @@ sqrt_fix_2_ret_fast2_sandybridge: movdqa xmm6, xmm10 mov r9, r15 dec r14d - jne main_loop_double_fast2_sandybridge + jne cnv2_double_main_loop_${ALGO}_sandybridge ldmxcsr DWORD PTR [rsp+272] movaps xmm13, XMMWORD PTR [rsp+48] @@ -358,19 +358,19 @@ sqrt_fix_2_ret_fast2_sandybridge: pop rsi pop rbp pop rbx - jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp + jmp
cnv2_double_main_loop_${ALGO}_sandybridge_endp -div_fix_1_fast2_sandybridge: +div_fix_1_${ALGO}_sandybridge: dec rbx add r11, rdx - jmp div_fix_1_ret_fast2_sandybridge + jmp div_fix_1_ret_${ALGO}_sandybridge -div_fix_2_fast2_sandybridge: +div_fix_2_${ALGO}_sandybridge: dec rdx add r8, r9 - jmp div_fix_2_ret_fast2_sandybridge + jmp div_fix_2_ret_${ALGO}_sandybridge -sqrt_fix_1_fast2_sandybridge: +sqrt_fix_1_${ALGO}_sandybridge: movq r8, xmm3 movdqa xmm0, xmm5 psrldq xmm0, 8 @@ -389,9 +389,9 @@ sqrt_fix_1_fast2_sandybridge: adc r9, 0 movq xmm5, r9 punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_fast2_sandybridge + jmp sqrt_fix_1_ret_${ALGO}_sandybridge -sqrt_fix_2_fast2_sandybridge: +sqrt_fix_2_${ALGO}_sandybridge: psrldq xmm3, 8 movq r11, xmm3 dec r8 @@ -409,6 +409,6 @@ sqrt_fix_2_fast2_sandybridge: adc r8, 0 movq xmm0, r8 punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_fast2_sandybridge + jmp sqrt_fix_2_ret_${ALGO}_sandybridge -cnv2_double_mainloop_asm_fast2_sandybridge_endp: +cnv2_double_main_loop_${ALGO}_sandybridge_endp: diff --git a/src/crypto/asm/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/cnv2_main_loop_bulldozer.inc.in similarity index 89% rename from src/crypto/asm/cnv2_main_loop_bulldozer.inc rename to src/crypto/asm/cnv2_main_loop_bulldozer.inc.in index 1b2de354..f849f3ca 100644 --- a/src/crypto/asm/cnv2_main_loop_bulldozer.inc +++ b/src/crypto/asm/cnv2_main_loop_bulldozer.inc.in @@ -15,7 +15,7 @@ mov rax, QWORD PTR [rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 + mov ebp, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] xor r8, QWORD PTR [rcx] mov r11, QWORD PTR [rcx+40] @@ -31,7 +31,7 @@ mov rcx, QWORD PTR [rcx+88] xor rcx, QWORD PTR [r9+72] mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 + and r10d, ${MASK} movaps XMMWORD PTR [rsp+48], xmm6 movq xmm4, rax movaps XMMWORD PTR [rsp+32], xmm7 @@ -46,7 +46,7 @@ punpcklqdq xmm4, xmm0 ALIGN 16 -cnv2_main_loop_bulldozer: +cnv2_main_loop_${ALGO}_bulldozer: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm6, r8 
pinsrq xmm6, r11, 1 @@ -83,7 +83,7 @@ cnv2_main_loop_bulldozer: movdqa xmm0, xmm5 pxor xmm0, xmm3 mov r10, r14 - and r10d, 2097136 + and r10d, ${MASK} movdqa XMMWORD PTR [rdx], xmm0 xor rsi, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx] @@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer: sqrtsd xmm1, xmm0 movq rdi, xmm1 test rdi, 524287 - je sqrt_fixup_bulldozer + je sqrt_fixup_${ALGO}_bulldozer shr rdi, 19 -sqrt_fixup_bulldozer_ret: +sqrt_fixup_${ALGO}_bulldozer_ret: mov rax, rsi mul r14 movq xmm1, rax @@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret: mov QWORD PTR [r12+8], r11 mov r10, r8 xor r11, r13 - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm3, xmm5 dec ebp - jne cnv2_main_loop_bulldozer + jne cnv2_main_loop_${ALGO}_bulldozer ldmxcsr DWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp+48] @@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret: pop r13 pop r12 pop rdi - jmp cnv2_main_loop_bulldozer_endp + jmp cnv2_main_loop_${ALGO}_bulldozer_endp -sqrt_fixup_bulldozer: +sqrt_fixup_${ALGO}_bulldozer: movq r9, xmm5 add r9, r15 dec rdi @@ -175,6 +175,6 @@ sqrt_fixup_bulldozer: imul rcx, rax sub rcx, r9 adc rdi, 0 - jmp sqrt_fixup_bulldozer_ret + jmp sqrt_fixup_${ALGO}_bulldozer_ret -cnv2_main_loop_bulldozer_endp: \ No newline at end of file +cnv2_main_loop_${ALGO}_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/cnv2_main_loop_ivybridge.inc deleted file mode 100644 index 35ee0627..00000000 --- a/src/crypto/asm/cnv2_main_loop_ivybridge.inc +++ /dev/null @@ -1,186 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 524288 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR 
[rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - #ifdef __APPLE__ - ALIGN 16 - #else - ALIGN 64 - #endif -$main_loop_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je $sqrt_fixup_ivybridge - psrlq xmm3, 19 -$sqrt_fixup_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx 
- mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne $main_loop_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp $cnv2_main_loop_ivybridge_endp - -$sqrt_fixup_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp $sqrt_fixup_ivybridge_ret - -$cnv2_main_loop_ivybridge_endp: diff --git a/src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc b/src/crypto/asm/cnv2_main_loop_ivybridge.inc.in similarity index 90% rename from src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc rename to src/crypto/asm/cnv2_main_loop_ivybridge.inc.in index 8dd92f3b..012bed12 100644 --- a/src/crypto/asm/cn_fastv2_main_loop_ivybridge.inc +++ b/src/crypto/asm/cnv2_main_loop_ivybridge.inc.in @@ -15,7 +15,7 @@ mov rax, QWORD PTR [rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov esi, 262144 + mov esi, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] mov r13d, -2147483647 xor r8, QWORD PTR [rcx] @@ -35,7 +35,7 @@ movaps XMMWORD PTR [rsp+64], xmm6 movaps XMMWORD PTR [rsp+48], xmm7 movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 + and 
r10d, ${MASK} movq xmm5, rax mov ax, 1023 @@ -52,7 +52,7 @@ #else ALIGN 64 #endif -$main_loop_fast2_ivybridge: +cnv2_main_loop_${ALGO}_ivybridge: lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d mov eax, r10d @@ -66,7 +66,7 @@ $main_loop_fast2_ivybridge: aesenc xmm6, xmm7 movq rbp, xmm6 mov r9, rbp - and r9d, 2097136 + and r9d, ${MASK} movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx] @@ -109,9 +109,9 @@ $main_loop_fast2_ivybridge: sqrtsd xmm3, xmm0 movq rdx, xmm3 test edx, 524287 - je $sqrt_fixup_fast2_ivybridge + je sqrt_fixup_${ALGO}_ivybridge psrlq xmm3, 19 -$sqrt_fixup_fast2_ivybridge_ret: +sqrt_fixup_${ALGO}_ivybridge_ret: mov ecx, r10d mov rax, rdi @@ -122,7 +122,7 @@ $sqrt_fixup_fast2_ivybridge_ret: mov QWORD PTR [r14], r8 xor r8, rdi mov edi, r8d - and edi, 2097136 + and edi, ${MASK} movq xmm0, rax xor rax, [rcx+rbx+8] add r11, rax @@ -147,7 +147,7 @@ $sqrt_fixup_fast2_ivybridge_ret: mov r10d, edi xor r11, r12 dec rsi - jne $main_loop_fast2_ivybridge + jne cnv2_main_loop_${ALGO}_ivybridge ldmxcsr DWORD PTR [rsp] mov rbx, QWORD PTR [rsp+160] @@ -162,9 +162,9 @@ $sqrt_fixup_fast2_ivybridge_ret: pop rdi pop rsi pop rbp - jmp $cnv2_main_loop_fast2_ivybridge_endp + jmp cnv2_main_loop_${ALGO}_ivybridge_endp -$sqrt_fixup_fast2_ivybridge: +sqrt_fixup_${ALGO}_ivybridge: dec rdx mov r13d, -1022 shl r13, 32 @@ -181,6 +181,6 @@ $sqrt_fixup_fast2_ivybridge: sub rcx, r9 adc rdx, 0 movq xmm3, rdx - jmp $sqrt_fixup_fast2_ivybridge_ret + jmp sqrt_fixup_${ALGO}_ivybridge_ret -$cnv2_main_loop_fast2_ivybridge_endp: +cnv2_main_loop_${ALGO}_ivybridge_endp: diff --git a/src/crypto/asm/cnv2_main_loop_ryzen.inc b/src/crypto/asm/cnv2_main_loop_ryzen.inc.in similarity index 90% rename from src/crypto/asm/cnv2_main_loop_ryzen.inc rename to src/crypto/asm/cnv2_main_loop_ryzen.inc.in index 42054413..6d71264f 100644 --- a/src/crypto/asm/cnv2_main_loop_ryzen.inc +++ b/src/crypto/asm/cnv2_main_loop_ryzen.inc.in @@ -15,7 +15,7 @@ mov rax, 
QWORD PTR [rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 + mov ebp, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] xor r8, QWORD PTR [rcx] mov r11, QWORD PTR [rcx+40] @@ -31,7 +31,7 @@ mov rcx, QWORD PTR [rcx+88] xor rcx, QWORD PTR [r9+72] mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 + and r10d, ${MASK} movaps XMMWORD PTR [rsp+48], xmm6 movq xmm4, rax movaps XMMWORD PTR [rsp+32], xmm7 @@ -50,7 +50,7 @@ #else ALIGN 64 #endif -$main_loop_ryzen: +cnv2_main_loop_${ALGO}_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm0, r11 movq xmm6, r8 @@ -82,7 +82,7 @@ $main_loop_ryzen: movdqa xmm0, xmm5 pxor xmm0, xmm3 mov r10, r14 - and r10d, 2097136 + and r10d, ${MASK} movdqa XMMWORD PTR [rdx], xmm0 xor rsi, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx] @@ -107,10 +107,10 @@ $main_loop_ryzen: sqrtsd xmm1, xmm0 movq rdi, xmm1 test rdi, 524287 - je $sqrt_fixup_ryzen + je sqrt_fixup_${ALGO}_ryzen shr rdi, 19 -$sqrt_fixup_ryzen_ret: +sqrt_fixup_${ALGO}_ryzen_ret: mov rax, rsi mul r14 movq xmm1, rax @@ -142,10 +142,10 @@ $sqrt_fixup_ryzen_ret: mov QWORD PTR [r12+8], r11 mov r10, r8 xor r11, r13 - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm3, xmm5 dec ebp - jne $main_loop_ryzen + jne cnv2_main_loop_${ALGO}_ryzen ldmxcsr DWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp+48] @@ -161,9 +161,9 @@ $sqrt_fixup_ryzen_ret: pop r13 pop r12 pop rdi - jmp $cnv2_main_loop_ryzen_endp + jmp cnv2_main_loop_${ALGO}_ryzen_endp -$sqrt_fixup_ryzen: +sqrt_fixup_${ALGO}_ryzen: movq r9, xmm2 dec rdi mov edx, -1022 @@ -178,6 +178,6 @@ $sqrt_fixup_ryzen: imul rcx, rax sub rcx, r9 adc rdi, 0 - jmp $sqrt_fixup_ryzen_ret + jmp sqrt_fixup_${ALGO}_ryzen_ret -$cnv2_main_loop_ryzen_endp: +cnv2_main_loop_${ALGO}_ryzen_endp: diff --git a/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in similarity index 91% rename from src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc rename to 
src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in index bc3da761..46a58fb8 100644 --- a/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc +++ b/src/crypto/asm/cnv2_main_loop_soft_aes_sandybridge.inc.in @@ -47,7 +47,7 @@ mov rax, r8 punpcklqdq xmm4, xmm0 - and eax, 2097136 + and eax, ${MASK} movq xmm10, QWORD PTR [r10+96] movq xmm0, rcx mov rcx, QWORD PTR [r10+104] @@ -57,14 +57,14 @@ mov QWORD PTR [rsp+240], r9 punpcklqdq xmm5, xmm0 movq xmm13, rcx - mov r12d, 524288 + mov r12d, ${ITERATIONS} #ifdef __APPLE__ ALIGN 16 #else ALIGN 64 #endif -cnv2_mainloop_soft_aes_sandybridge: +cnv2_main_loop_${ALGO}_soft_aes_sandybridge: movd xmm11, r12d mov r12, QWORD PTR [r10+272] lea r13, QWORD PTR [rax+r11] @@ -152,7 +152,7 @@ cnv2_mainloop_soft_aes_sandybridge: movdqu XMMWORD PTR [rdx+r11], xmm1 movq rdi, xmm6 mov r10, rdi - and r10d, 2097136 + and r10d, ${MASK} xor edx, edx mov rax, rcx shl rax, 32 @@ -185,9 +185,9 @@ cnv2_mainloop_soft_aes_sandybridge: sqrtsd xmm1, xmm0 movq rdx, xmm1 test rdx, 524287 - je sqrt_fixup_soft_aes_sandybridge + je sqrt_fixup_${ALGO}_soft_aes_sandybridge psrlq xmm1, 19 -sqrt_fixup_soft_aes_sandybridge_ret: +sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret: mov r9, r10 movdqa xmm13, xmm1 @@ -223,12 +223,12 @@ sqrt_fixup_soft_aes_sandybridge_ret: xor r8, rbx mov rax, r8 mov QWORD PTR [r14+8], r9 - and eax, 2097136 + and eax, ${MASK} xor r9, rbp mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+248], rax sub r12d, 1 - jne cnv2_mainloop_soft_aes_sandybridge + jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge ldmxcsr DWORD PTR [rsp+4] movaps xmm6, XMMWORD PTR [rsp+16] @@ -249,9 +249,9 @@ sqrt_fixup_soft_aes_sandybridge_ret: pop rsi pop rbp pop rbx - jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp + jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp -sqrt_fixup_soft_aes_sandybridge: +sqrt_fixup_${ALGO}_soft_aes_sandybridge: dec rdx mov r15d, -1022 shl r15, 32 @@ -266,6 +266,6 @@ sqrt_fixup_soft_aes_sandybridge: sub rcx, r9 adc rdx, 0 movq xmm1, rdx 
- jmp sqrt_fixup_soft_aes_sandybridge_ret + jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret -cnv2_mainloop_soft_aes_sandybridge_asm_endp: +cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp: diff --git a/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc b/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc deleted file mode 100644 index 05dbe77b..00000000 --- a/src/crypto/asm/win/cn_fast_mainloop_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 2097136 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cn_fast_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 2097136 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 2097136 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cn_fast_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 
diff --git a/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 5416be9d..00000000 --- a/src/crypto/asm/win/cn_fast_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - ALIGN 64 -cn_fast_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx 
ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 2097136 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 2097136 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_fast_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc deleted file mode 100644 index d712e10e..00000000 --- a/src/crypto/asm/win/cn_fastv2_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - 
push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 262144 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 - mov rax, QWORD PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 2097136 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq 
xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 64 -main_loop_double_fast2_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 2097136 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 2097136 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR 
[r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR [r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_fast2_sandybridge -div_fix_1_ret_fast2_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_fast2_sandybridge -div_fix_2_ret_fast2_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_fast2_sandybridge -sqrt_fix_1_ret_fast2_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je 
sqrt_fix_2_fast2_sandybridge -sqrt_fix_2_ret_fast2_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 2097136 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] - movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_fast2_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp - -div_fix_1_fast2_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_fast2_sandybridge - -div_fix_2_fast2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_fast2_sandybridge - -sqrt_fix_1_fast2_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 
- shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_fast2_sandybridge - -sqrt_fix_2_fast2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_fast2_sandybridge - -cnv2_double_mainloop_asm_fast2_sandybridge_endp: diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc b/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc deleted file mode 100644 index a73752fc..00000000 --- a/src/crypto/asm/win/cn_fastv2_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_fast2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - 
pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, [rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_fast2_bulldozer - shr rdi, 19 - -sqrt_fixup_fast2_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_fast2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, 
QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_fast2_bulldozer_endp - -sqrt_fixup_fast2_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_fast2_bulldozer_ret - -cnv2_main_loop_fast2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc b/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc deleted file mode 100644 index bc3d592c..00000000 --- a/src/crypto/asm/win/cn_fastv2_main_loop_ivybridge.inc +++ /dev/null @@ -1,182 +0,0 @@ - mov QWORD PTR [rsp+24], rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 262144 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 2097136 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 64 -$main_loop_fast2_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor 
ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 2097136 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je $sqrt_fixup_fast2_ivybridge - psrlq xmm3, 19 -$sqrt_fixup_fast2_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 2097136 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne $main_loop_fast2_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD 
PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp $cnv2_main_loop_fast2_ivybridge_endp - -$sqrt_fixup_fast2_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp $sqrt_fixup_fast2_ivybridge_ret - -$cnv2_main_loop_fast2_ivybridge_endp: diff --git a/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc b/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc deleted file mode 100644 index 2bf76e19..00000000 --- a/src/crypto/asm/win/cn_fastv2_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 262144 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 64 -$main_loop_fast2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD 
PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 2097136 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je $sqrt_fixup_fast2_ryzen - shr rdi, 19 - -$sqrt_fixup_fast2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 2097136 - movdqa xmm3, xmm5 - dec ebp - jne $main_loop_fast2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR 
[r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp $cnv2_main_loop_fast2_ryzen_endp - -$sqrt_fixup_fast2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp $sqrt_fixup_fast2_ryzen_ret - -$cnv2_main_loop_fast2_ryzen_endp: diff --git a/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 2e678c04..00000000 --- a/src/crypto/asm/win/cn_fastv2_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,267 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 2097136 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], 
r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 262144 - - ALIGN 64 -cnv2_mainloop_soft_aes_fast2_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - 
movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 2097136 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je sqrt_fixup_soft_aes_fast2_sandybridge - psrlq xmm1, 19 -sqrt_fixup_soft_aes_fast2_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 2097136 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_mainloop_soft_aes_fast2_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD 
PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp - -sqrt_fixup_soft_aes_fast2_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret - -cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp: diff --git a/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc b/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc deleted file mode 100644 index b6bc2e6c..00000000 --- a/src/crypto/asm/win/cn_liteupx_mainloop_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 131072 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 1048560 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cn_litev1_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 1048560 - add r9, rsi - movdqa xmm0, xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - 
mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 1048560 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cn_litev1_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 94f083c1..00000000 --- a/src/crypto/asm/win/cn_liteupx_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 131072 - - ALIGN 64 -cn_litev1_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, 
bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_litev1_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - 
pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cn_litev1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_litev1_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index d75b1def..00000000 --- a/src/crypto/asm/win/cn_litev1_mainloop_soft_aes_sandybridge.inc +++ /dev/null @@ -1,162 +0,0 @@ - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 72 - - movaps XMMWORD PTR [rsp], xmm6 - movaps XMMWORD PTR [rsp+16], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - movaps XMMWORD PTR [rsp+48], xmm9 - - mov rax, QWORD PTR [rcx+48] - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm4, rax - mov rax, QWORD PTR [rcx+256] - mov r13, QWORD PTR [rcx+40] - movq xmm0, rdx - xor r13, QWORD PTR [rcx+8] - mov rdx, r8 - mov rdi, QWORD PTR [rcx+224] - and edx, 1048560 - mov rax, QWORD PTR [rax+35] - xor rax, QWORD PTR [rcx+192] - movq xmm5, rax - movq xmm8, rdi - punpcklqdq xmm4, xmm0 - mov QWORD PTR [rsp+64], rdx - - movq xmm6, rcx - mov rax, QWORD PTR [rcx+264] - movq xmm7, rax - - mov eax, 262144 - - ALIGN 64 -cn_litev1_mainloop_soft_aes_sandybridge: - movq xmm9, rax - mov r12, QWORD PTR [rcx+272] - mov esi, DWORD PTR [rdx+rdi] - mov r10d, DWORD PTR [rdx+rdi+4] - mov ebp, DWORD PTR [rdx+rdi+12] - mov r14d, DWORD PTR [rdx+rdi+8] - mov rdx, QWORD PTR [rsp+64] - movzx ecx, sil - shr esi, 8 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR [r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor 
r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - add ebp, 256 - movd xmm1, r11d - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movq rdi, xmm8 - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - punpckldq xmm2, xmm1 - movq xmm1, r8 - xor eax, DWORD PTR [r12+rcx*4] - xor eax, r15d - movd xmm3, eax - movq rax, xmm7 - punpckldq xmm3, xmm0 - movq xmm0, r13 - punpcklqdq xmm1, xmm0 - punpckldq xmm3, xmm2 - pxor xmm3, xmm1 - movq r9, xmm3 - mov r10d, r9d - and r10d, 1048560 - movdqa xmm0, xmm3 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx+rdi], xmm0 - psrldq xmm0, 11 - movq rcx, xmm0 - movzx ecx, cl - mov cl, BYTE PTR [rcx+rax] - mov BYTE PTR [rdi+rdx+11], cl - mov rbx, QWORD PTR [r10+rdi] - mov rcx, r9 - lea r9, QWORD PTR [r10+rdi] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - movdqa xmm4, xmm3 - mul rcx - movq rcx, xmm6 - add r8, rdx - add r13, rax - movq rax, xmm5 - xor rax, r13 - mov QWORD PTR [r9], r8 - xor r8, rbx - mov QWORD PTR [r9+8], rax - movq rax, xmm9 - mov rdx, r8 - xor r13, r11 - and edx, 1048560 - mov QWORD PTR [rsp+64], rdx - sub eax, 1 - jne cn_litev1_mainloop_soft_aes_sandybridge - - movaps xmm6, XMMWORD PTR [rsp] - movaps xmm7, XMMWORD PTR [rsp+16] - movaps xmm8, XMMWORD PTR [rsp+32] - movaps xmm9, XMMWORD PTR [rsp+48] - - add rsp, 72 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx diff --git a/src/crypto/asm/win/cn_main_loop.asm b/src/crypto/asm/win/cn_main_loop.asm index b9635a36..e01e3134 100644 --- a/src/crypto/asm/win/cn_main_loop.asm +++ b/src/crypto/asm/win/cn_main_loop.asm @@ -1,166 +1,171 @@ _TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE -PUBLIC 
cnv1_mainloop_sandybridge_asm -PUBLIC cn_litev1_mainloop_sandybridge_asm -PUBLIC cn_fast_mainloop_sandybridge_asm -PUBLIC cnv2_mainloop_ivybridge_asm -PUBLIC cnv2_mainloop_ryzen_asm -PUBLIC cnv2_mainloop_bulldozer_asm -PUBLIC cnv2_double_mainloop_sandybridge_asm -PUBLIC cn_fastv2_mainloop_ivybridge_asm -PUBLIC cn_fastv2_mainloop_ryzen_asm -PUBLIC cn_fastv2_mainloop_bulldozer_asm -PUBLIC cn_fastv2_double_mainloop_sandybridge_asm -PUBLIC cn_liteupx_mainloop_sandybridge_asm -PUBLIC cn_ultralitev2_mainloop_ivybridge_asm -PUBLIC cn_ultralitev2_mainloop_ryzen_asm -PUBLIC cn_ultralitev2_mainloop_bulldozer_asm -PUBLIC cn_ultralitev2_double_mainloop_sandybridge_asm -PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm -PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm -PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm -PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm -PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm -PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm -PUBLIC cn_ultralitev2_mainloop_soft_aes_sandybridge_asm +PUBLIC cnv1_main_loop_sandybridge_asm +PUBLIC cnv1_main_loop_lite_sandybridge_asm +PUBLIC cnv1_main_loop_fast_sandybridge_asm +PUBLIC cnv1_main_loop_upx_sandybridge_asm + +PUBLIC cnv2_main_loop_ivybridge_asm +PUBLIC cnv2_main_loop_ryzen_asm +PUBLIC cnv2_main_loop_bulldozer_asm +PUBLIC cnv2_double_main_loop_sandybridge_asm + +PUBLIC cnv2_main_loop_fastv2_ivybridge_asm +PUBLIC cnv2_main_loop_fastv2_ryzen_asm +PUBLIC cnv2_main_loop_fastv2_bulldozer_asm +PUBLIC cnv2_double_main_loop_fastv2_sandybridge_asm + +PUBLIC cnv2_main_loop_ultralite_ivybridge_asm +PUBLIC cnv2_main_loop_ultralite_ryzen_asm +PUBLIC cnv2_main_loop_ultralite_bulldozer_asm +PUBLIC cnv2_double_main_loop_ultralite_sandybridge_asm + +PUBLIC cnv1_main_loop_soft_aes_sandybridge_asm +PUBLIC cnv1_main_loop_lite_soft_aes_sandybridge_asm +PUBLIC cnv1_main_loop_fast_soft_aes_sandybridge_asm +PUBLIC cnv1_main_loop_upx_soft_aes_sandybridge_asm + +PUBLIC cnv2_main_loop_soft_aes_sandybridge_asm +PUBLIC 
cnv2_main_loop_fastv2_soft_aes_sandybridge_asm +PUBLIC cnv2_main_loop_ultralite_soft_aes_sandybridge_asm ALIGN 64 -cnv1_mainloop_sandybridge_asm PROC - INCLUDE cnv1_mainloop_sandybridge.inc +cnv1_main_loop_sandybridge_asm PROC + INCLUDE cnv1_main_loop_sandybridge.inc ret 0 -cnv1_mainloop_sandybridge_asm ENDP +cnv1_main_loop_sandybridge_asm ENDP ALIGN 64 -cn_litev1_mainloop_sandybridge_asm PROC - INCLUDE cn_litev1_mainloop_sandybridge.inc +cnv1_main_loop_lite_sandybridge_asm PROC + INCLUDE cnv1_main_loop_lite_sandybridge.inc ret 0 -cn_litev1_mainloop_sandybridge_asm ENDP +cnv1_main_loop_lite_sandybridge_asm ENDP ALIGN 64 -cn_fast_mainloop_sandybridge_asm PROC - INCLUDE cn_fast_mainloop_sandybridge.inc +cnv1_main_loop_fast_sandybridge_asm PROC + INCLUDE cnv1_main_loop_fast_sandybridge.inc ret 0 -cn_fast_mainloop_sandybridge_asm ENDP +cnv1_main_loop_fast_sandybridge_asm ENDP ALIGN 64 -cnv2_mainloop_ivybridge_asm PROC +cnv1_main_loop_upx_sandybridge_asm PROC + INCLUDE cnv1_main_loop_upx_sandybridge.inc + ret 0 +cnv1_main_loop_upx_sandybridge_asm ENDP + +ALIGN 64 +cnv2_main_loop_ivybridge_asm PROC INCLUDE cnv2_main_loop_ivybridge.inc ret 0 -cnv2_mainloop_ivybridge_asm ENDP +cnv2_main_loop_ivybridge_asm ENDP ALIGN 64 -cnv2_mainloop_ryzen_asm PROC +cnv2_main_loop_ryzen_asm PROC INCLUDE cnv2_main_loop_ryzen.inc ret 0 -cnv2_mainloop_ryzen_asm ENDP +cnv2_main_loop_ryzen_asm ENDP ALIGN 64 -cnv2_mainloop_bulldozer_asm PROC +cnv2_main_loop_bulldozer_asm PROC INCLUDE cnv2_main_loop_bulldozer.inc ret 0 -cnv2_mainloop_bulldozer_asm ENDP +cnv2_main_loop_bulldozer_asm ENDP ALIGN 64 -cnv2_double_mainloop_sandybridge_asm PROC +cnv2_double_main_loop_sandybridge_asm PROC INCLUDE cnv2_double_main_loop_sandybridge.inc ret 0 -cnv2_double_mainloop_sandybridge_asm ENDP +cnv2_double_main_loop_sandybridge_asm ENDP ALIGN 64 -cn_fastv2_mainloop_ivybridge_asm PROC - INCLUDE cn_fastv2_main_loop_ivybridge.inc +cnv2_main_loop_fastv2_ivybridge_asm PROC + INCLUDE cnv2_main_loop_fastv2_ivybridge.inc 
ret 0 -cn_fastv2_mainloop_ivybridge_asm ENDP +cnv2_main_loop_fastv2_ivybridge_asm ENDP ALIGN 64 -cn_fastv2_mainloop_ryzen_asm PROC - INCLUDE cn_fastv2_main_loop_ryzen.inc +cnv2_main_loop_fastv2_ryzen_asm PROC + INCLUDE cnv2_main_loop_fastv2_ryzen.inc ret 0 -cn_fastv2_mainloop_ryzen_asm ENDP +cnv2_main_loop_fastv2_ryzen_asm ENDP ALIGN 64 -cn_fastv2_mainloop_bulldozer_asm PROC - INCLUDE cn_fastv2_main_loop_bulldozer.inc +cnv2_main_loop_fastv2_bulldozer_asm PROC + INCLUDE cnv2_main_loop_fastv2_bulldozer.inc ret 0 -cn_fastv2_mainloop_bulldozer_asm ENDP +cnv2_main_loop_fastv2_bulldozer_asm ENDP ALIGN 64 -cn_fastv2_double_mainloop_sandybridge_asm PROC - INCLUDE cn_fastv2_double_main_loop_sandybridge.inc +cnv2_double_main_loop_fastv2_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_fastv2_sandybridge.inc ret 0 -cn_fastv2_double_mainloop_sandybridge_asm ENDP +cnv2_double_main_loop_fastv2_sandybridge_asm ENDP ALIGN 64 -cn_liteupx_mainloop_sandybridge_asm PROC - INCLUDE cn_liteupx_mainloop_sandybridge.inc +cnv2_main_loop_ultralite_ivybridge_asm PROC + INCLUDE cnv2_main_loop_ultralite_ivybridge.inc ret 0 -cn_liteupx_mainloop_sandybridge_asm ENDP +cnv2_main_loop_ultralite_ivybridge_asm ENDP ALIGN 64 -cn_ultralitev2_mainloop_ivybridge_asm PROC - INCLUDE cn_ultralitev2_main_loop_ivybridge.inc +cnv2_main_loop_ultralite_ryzen_asm PROC + INCLUDE cnv2_main_loop_ultralite_ryzen.inc ret 0 -cn_ultralitev2_mainloop_ivybridge_asm ENDP +cnv2_main_loop_ultralite_ryzen_asm ENDP ALIGN 64 -cn_ultralitev2_mainloop_ryzen_asm PROC - INCLUDE cn_ultralitev2_main_loop_ryzen.inc +cnv2_main_loop_ultralite_bulldozer_asm PROC + INCLUDE cnv2_main_loop_ultralite_bulldozer.inc ret 0 -cn_ultralitev2_mainloop_ryzen_asm ENDP +cnv2_main_loop_ultralite_bulldozer_asm ENDP ALIGN 64 -cn_ultralitev2_mainloop_bulldozer_asm PROC - INCLUDE cn_ultralitev2_main_loop_bulldozer.inc +cnv2_double_main_loop_ultralite_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_ultralite_sandybridge.inc ret 0 
-cn_ultralitev2_mainloop_bulldozer_asm ENDP +cnv2_double_main_loop_ultralite_sandybridge_asm ENDP ALIGN 64 -cn_ultralitev2_double_mainloop_sandybridge_asm PROC - INCLUDE cn_ultralitev2_double_main_loop_sandybridge.inc +cnv1_main_loop_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_main_loop_soft_aes_sandybridge.inc ret 0 -cn_ultralitev2_double_mainloop_sandybridge_asm ENDP +cnv1_main_loop_soft_aes_sandybridge_asm ENDP ALIGN 64 -cnv1_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc +cnv1_main_loop_lite_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_main_loop_lite_soft_aes_sandybridge.inc ret 0 -cnv1_mainloop_soft_aes_sandybridge_asm ENDP +cnv1_main_loop_lite_soft_aes_sandybridge_asm ENDP ALIGN 64 -cn_litev1_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cn_litev1_mainloop_soft_aes_sandybridge.inc +cnv1_main_loop_fast_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_main_loop_fast_soft_aes_sandybridge.inc ret 0 -cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP +cnv1_main_loop_fast_soft_aes_sandybridge_asm ENDP ALIGN 64 -cn_fast_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc +cnv1_main_loop_upx_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_main_loop_upx_soft_aes_sandybridge.inc ret 0 -cn_fast_mainloop_soft_aes_sandybridge_asm ENDP +cnv1_main_loop_upx_soft_aes_sandybridge_asm ENDP ALIGN 64 -cnv2_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc +cnv2_main_loop_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_main_loop_soft_aes_sandybridge.inc ret 0 -cnv2_mainloop_soft_aes_sandybridge_asm ENDP +cnv2_main_loop_soft_aes_sandybridge_asm ENDP ALIGN 64 -cn_fastv2_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cn_fastv2_mainloop_soft_aes_sandybridge.inc +cnv2_main_loop_fastv2_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_main_loop_fastv2_soft_aes_sandybridge.inc ret 0 -cn_fastv2_mainloop_soft_aes_sandybridge_asm ENDP +cnv2_main_loop_fastv2_soft_aes_sandybridge_asm 
ENDP ALIGN 64 -cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cn_liteupx_mainloop_soft_aes_sandybridge.inc +cnv2_main_loop_ultralite_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_main_loop_ultralite_soft_aes_sandybridge.inc ret 0 -cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP - -ALIGN 64 -cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PROC - INCLUDE cn_ultralitev2_mainloop_soft_aes_sandybridge.inc - ret 0 -cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ENDP +cnv2_main_loop_ultralite_soft_aes_sandybridge_asm ENDP _TEXT_CN_MAINLOOP ENDS END \ No newline at end of file diff --git a/src/crypto/asm/win/cn_main_loop_win_gcc.S b/src/crypto/asm/win/cn_main_loop_win_gcc.S index 159937ef..ef48a077 100644 --- a/src/crypto/asm/win/cn_main_loop_win_gcc.S +++ b/src/crypto/asm/win/cn_main_loop_win_gcc.S @@ -3,142 +3,146 @@ # define FN_PREFIX(fn) fn .section .text -.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm) -.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) -.global FN_PREFIX(cnv2_mainloop_ryzen_asm) -.global FN_PREFIX(cnv2_mainloop_bulldozer_asm) -.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm) -.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm) -.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm) -.global 
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) -.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm) +.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm) + +.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm) + +.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm) ALIGN 64 -FN_PREFIX(cnv1_mainloop_sandybridge_asm): - #include "../cnv1_mainloop_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_sandybridge_asm): + #include "../cnv1_main_loop_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_litev1_mainloop_sandybridge_asm): - #include "../cn_litev1_mainloop_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm): + #include "../cnv1_main_loop_lite_sandybridge.inc" ret 0 ALIGN 64 
-FN_PREFIX(cn_fast_mainloop_sandybridge_asm): - #include "../cn_fast_mainloop_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm): + #include "../cnv1_main_loop_fast_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv2_mainloop_ivybridge_asm): +FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm): + #include "../cnv1_main_loop_upx_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_main_loop_ivybridge_asm): #include "../cnv2_main_loop_ivybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv2_mainloop_ryzen_asm): +FN_PREFIX(cnv2_main_loop_ryzen_asm): #include "../cnv2_main_loop_ryzen.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv2_mainloop_bulldozer_asm): +FN_PREFIX(cnv2_main_loop_bulldozer_asm): #include "../cnv2_main_loop_bulldozer.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): +FN_PREFIX(cnv2_double_main_loop_sandybridge_asm): #include "../cnv2_double_main_loop_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm): - #include "../cn_fastv2_main_loop_ivybridge.inc" +FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm): + #include "../cnv2_main_loop_fastv2_ivybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fastv2_mainloop_ryzen_asm): - #include "../cn_fastv2_main_loop_ryzen.inc" +FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm): + #include "../cnv2_main_loop_fastv2_ryzen.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm): - #include "../cn_fastv2_main_loop_bulldozer.inc" +FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm): + #include "../cnv2_main_loop_fastv2_bulldozer.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm): - #include "../cn_fastv2_double_main_loop_sandybridge.inc" +FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm): + #include "../cnv2_double_main_loop_fastv2_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): - #include "../cn_liteupx_mainloop_sandybridge.inc" +FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm): + #include "../cnv2_main_loop_ultralite_ivybridge.inc" ret 0 ALIGN 64 
-FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): - #include "../cn_ultralitev2_main_loop_ivybridge.inc" +FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm): + #include "../cnv2_main_loop_ultralite_ryzen.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): - #include "../cn_ultralitev2_main_loop_ryzen.inc" +FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm): + #include "../cnv2_main_loop_ultralite_bulldozer.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): - #include "../cn_ultralitev2_main_loop_bulldozer.inc" +FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm): + #include "../cnv2_double_main_loop_ultralite_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): - #include "../cn_ultralitev2_double_main_loop_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm): + #include "../cnv1_main_loop_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): - #include "../cnv1_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm): + #include "../cnv1_main_loop_lite_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm): - #include "../cn_litev1_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm): + #include "../cnv1_main_loop_fast_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm): - #include "../cn_fast_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm): + #include "../cnv1_main_loop_upx_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): - #include "../cnv2_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm): + #include "../cnv2_main_loop_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): - #include 
"../cn_fastv2_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm): + #include "../cnv2_main_loop_fastv2_soft_aes_sandybridge.inc" ret 0 ALIGN 64 -FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): - #include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc" - ret 0 - -ALIGN 64 -FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm): - #include "../cn_ultralitev2_mainloop_soft_aes_sandybridge.inc" +FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm): + #include "../cnv2_main_loop_ultralite_soft_aes_sandybridge.inc" ret 0 \ No newline at end of file diff --git a/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc deleted file mode 100644 index 30d6e814..00000000 --- a/src/crypto/asm/win/cn_ultralitev2_double_main_loop_sandybridge.inc +++ /dev/null @@ -1,410 +0,0 @@ - mov rax, rsp - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 184 - - stmxcsr DWORD PTR [rsp+272] - mov DWORD PTR [rsp+276], 24448 - ldmxcsr DWORD PTR [rsp+276] - - mov r13, QWORD PTR [rcx+224] - mov r9, rdx - mov r10, QWORD PTR [rcx+32] - mov r8, rcx - xor r10, QWORD PTR [rcx] - mov r14d, 65536 - mov r11, QWORD PTR [rcx+40] - xor r11, QWORD PTR [rcx+8] - mov rsi, QWORD PTR [rdx+224] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov rdi, QWORD PTR [r9+32] - xor rdi, QWORD PTR [r9] - mov rbp, QWORD PTR [r9+40] - xor rbp, QWORD PTR [r9+8] - movq xmm0, rdx - movaps XMMWORD PTR [rax-88], xmm6 - movaps XMMWORD PTR [rax-104], xmm7 - movaps XMMWORD PTR [rax-120], xmm8 - movaps XMMWORD PTR [rsp+112], xmm9 - movaps XMMWORD PTR [rsp+96], xmm10 - movaps XMMWORD PTR [rsp+80], xmm11 - movaps XMMWORD PTR [rsp+64], xmm12 - movaps XMMWORD PTR [rsp+48], xmm13 - movaps XMMWORD PTR [rsp+32], xmm14 - movaps XMMWORD PTR [rsp+16], xmm15 - mov rdx, r10 - movq xmm4, QWORD PTR [r8+96] - and edx, 131056 - mov rax, QWORD 
PTR [rcx+48] - xorps xmm13, xmm13 - xor rax, QWORD PTR [rcx+16] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r8+72] - movq xmm5, QWORD PTR [r8+104] - movq xmm7, rax - - mov eax, 1 - shl rax, 52 - movq xmm14, rax - punpcklqdq xmm14, xmm14 - - mov eax, 1023 - shl rax, 52 - movq xmm12, rax - punpcklqdq xmm12, xmm12 - - mov rax, QWORD PTR [r8+80] - xor rax, QWORD PTR [r8+64] - punpcklqdq xmm7, xmm0 - movq xmm0, rcx - mov rcx, QWORD PTR [r9+56] - xor rcx, QWORD PTR [r9+24] - movq xmm3, rax - mov rax, QWORD PTR [r9+48] - xor rax, QWORD PTR [r9+16] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp], r13 - mov rcx, QWORD PTR [r9+88] - xor rcx, QWORD PTR [r9+72] - movq xmm6, rax - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - punpcklqdq xmm6, xmm0 - movq xmm0, rcx - mov QWORD PTR [rsp+256], r10 - mov rcx, rdi - mov QWORD PTR [rsp+264], r11 - movq xmm8, rax - and ecx, 131056 - punpcklqdq xmm8, xmm0 - movq xmm0, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, QWORD PTR [r9+104] - lea r8, QWORD PTR [rcx+rsi] - movdqu xmm11, XMMWORD PTR [r8] - punpcklqdq xmm5, xmm0 - lea r9, QWORD PTR [rdx+r13] - movdqu xmm15, XMMWORD PTR [r9] - - ALIGN 64 -main_loop_double_ultralitev2_sandybridge: - movdqu xmm9, xmm15 - mov eax, edx - mov ebx, edx - xor eax, 16 - xor ebx, 32 - xor edx, 48 - - movq xmm0, r11 - movq xmm2, r10 - punpcklqdq xmm2, xmm0 - aesenc xmm9, xmm2 - - movdqu xmm0, XMMWORD PTR [rax+r13] - movdqu xmm1, XMMWORD PTR [rbx+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [rbx+r13], xmm0 - movdqu xmm0, XMMWORD PTR [rdx+r13] - movdqu XMMWORD PTR [rdx+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [rax+r13], xmm0 - - movq r11, xmm9 - mov edx, r11d - and edx, 131056 - movdqa xmm0, xmm9 - pxor xmm0, xmm7 - movdqu XMMWORD PTR [r9], xmm0 - - lea rbx, QWORD PTR [rdx+r13] - mov r10, QWORD PTR [rdx+r13] - - movdqu xmm10, xmm11 - movq xmm0, rbp - movq xmm11, rdi - punpcklqdq xmm11, xmm0 - aesenc xmm10, xmm11 - - mov eax, ecx - mov 
r12d, ecx - xor eax, 16 - xor r12d, 32 - xor ecx, 48 - - movdqu xmm0, XMMWORD PTR [rax+rsi] - paddq xmm0, xmm6 - movdqu xmm1, XMMWORD PTR [r12+rsi] - movdqu XMMWORD PTR [r12+rsi], xmm0 - paddq xmm1, xmm11 - movdqu xmm0, XMMWORD PTR [rcx+rsi] - movdqu XMMWORD PTR [rcx+rsi], xmm1 - paddq xmm0, xmm8 - movdqu XMMWORD PTR [rax+rsi], xmm0 - - movq rcx, xmm10 - and ecx, 131056 - - movdqa xmm0, xmm10 - pxor xmm0, xmm6 - movdqu XMMWORD PTR [r8], xmm0 - mov r12, QWORD PTR [rcx+rsi] - - mov r9, QWORD PTR [rbx+8] - - xor edx, 16 - mov r8d, edx - mov r15d, edx - - movq rdx, xmm5 - shl rdx, 32 - movq rax, xmm4 - xor rdx, rax - xor r10, rdx - mov rax, r10 - mul r11 - mov r11d, r8d - xor r11d, 48 - movq xmm0, rdx - xor rdx, [r11+r13] - movq xmm1, rax - xor rax, [r11+r13+8] - punpcklqdq xmm0, xmm1 - - pxor xmm0, XMMWORD PTR [r8+r13] - xor r8d, 32 - movdqu xmm1, XMMWORD PTR [r11+r13] - paddq xmm0, xmm7 - paddq xmm1, xmm2 - movdqu XMMWORD PTR [r11+r13], xmm0 - movdqu xmm0, XMMWORD PTR [r8+r13] - movdqu XMMWORD PTR [r8+r13], xmm1 - paddq xmm0, xmm3 - movdqu XMMWORD PTR [r15+r13], xmm0 - - mov r11, QWORD PTR [rsp+256] - add r11, rdx - mov rdx, QWORD PTR [rsp+264] - add rdx, rax - mov QWORD PTR [rbx], r11 - xor r11, r10 - mov QWORD PTR [rbx+8], rdx - xor rdx, r9 - mov QWORD PTR [rsp+256], r11 - and r11d, 131056 - mov QWORD PTR [rsp+264], rdx - mov QWORD PTR [rsp+8], r11 - lea r15, QWORD PTR [r11+r13] - movdqu xmm15, XMMWORD PTR [r11+r13] - lea r13, QWORD PTR [rsi+rcx] - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movaps xmm2, xmm13 - movq r10, xmm0 - psllq xmm5, 1 - shl r10, 32 - movdqa xmm0, xmm9 - psrldq xmm0, 8 - movdqa xmm1, xmm10 - movq r11, xmm0 - psrldq xmm1, 8 - movq r8, xmm1 - psrldq xmm4, 8 - movaps xmm0, xmm13 - movq rax, xmm4 - xor r10, rax - movaps xmm1, xmm13 - xor r10, r12 - lea rax, QWORD PTR [r11+1] - shr rax, 1 - movdqa xmm3, xmm9 - punpcklqdq xmm3, xmm10 - paddq xmm5, xmm3 - movq rdx, xmm5 - psrldq xmm5, 8 - cvtsi2sd xmm2, rax - or edx, -2147483647 - lea rax, QWORD PTR 
[r8+1] - shr rax, 1 - movq r9, xmm5 - cvtsi2sd xmm0, rax - or r9d, -2147483647 - cvtsi2sd xmm1, rdx - unpcklpd xmm2, xmm0 - movaps xmm0, xmm13 - cvtsi2sd xmm0, r9 - unpcklpd xmm1, xmm0 - divpd xmm2, xmm1 - paddq xmm2, xmm14 - cvttsd2si rax, xmm2 - psrldq xmm2, 8 - mov rbx, rax - imul rax, rdx - sub r11, rax - js div_fix_1_ultralitev2_sandybridge -div_fix_1_ret_ultralitev2_sandybridge: - - cvttsd2si rdx, xmm2 - mov rax, rdx - imul rax, r9 - movd xmm2, r11d - movd xmm4, ebx - sub r8, rax - js div_fix_2_ultralitev2_sandybridge -div_fix_2_ret_ultralitev2_sandybridge: - - movd xmm1, r8d - movd xmm0, edx - punpckldq xmm2, xmm1 - punpckldq xmm4, xmm0 - punpckldq xmm4, xmm2 - paddq xmm3, xmm4 - movdqa xmm0, xmm3 - psrlq xmm0, 12 - paddq xmm0, xmm12 - sqrtpd xmm1, xmm0 - movq r9, xmm1 - movdqa xmm5, xmm1 - psrlq xmm5, 19 - test r9, 524287 - je sqrt_fix_1_ultralitev2_sandybridge -sqrt_fix_1_ret_ultralitev2_sandybridge: - - movq r9, xmm10 - psrldq xmm1, 8 - movq r8, xmm1 - test r8, 524287 - je sqrt_fix_2_ultralitev2_sandybridge -sqrt_fix_2_ret_ultralitev2_sandybridge: - - mov r12d, ecx - mov r8d, ecx - xor r12d, 16 - xor r8d, 32 - xor ecx, 48 - mov rax, r10 - mul r9 - movq xmm0, rax - movq xmm3, rdx - punpcklqdq xmm3, xmm0 - - movdqu xmm0, XMMWORD PTR [r12+rsi] - pxor xmm0, xmm3 - movdqu xmm1, XMMWORD PTR [r8+rsi] - xor rdx, [r8+rsi] - xor rax, [r8+rsi+8] - movdqu xmm3, XMMWORD PTR [rcx+rsi] - paddq xmm0, xmm6 - paddq xmm1, xmm11 - paddq xmm3, xmm8 - movdqu XMMWORD PTR [r8+rsi], xmm0 - movdqu XMMWORD PTR [rcx+rsi], xmm1 - movdqu XMMWORD PTR [r12+rsi], xmm3 - - add rdi, rdx - mov QWORD PTR [r13], rdi - xor rdi, r10 - mov ecx, edi - and ecx, 131056 - lea r8, QWORD PTR [rcx+rsi] - - mov rdx, QWORD PTR [r13+8] - add rbp, rax - mov QWORD PTR [r13+8], rbp - movdqu xmm11, XMMWORD PTR [rcx+rsi] - xor rbp, rdx - mov r13, QWORD PTR [rsp] - movdqa xmm3, xmm7 - mov rdx, QWORD PTR [rsp+8] - movdqa xmm8, xmm6 - mov r10, QWORD PTR [rsp+256] - movdqa xmm7, xmm9 - mov r11, QWORD PTR [rsp+264] 
- movdqa xmm6, xmm10 - mov r9, r15 - dec r14d - jne main_loop_double_ultralitev2_sandybridge - - ldmxcsr DWORD PTR [rsp+272] - movaps xmm13, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+184] - movaps xmm6, XMMWORD PTR [r11-24] - movaps xmm7, XMMWORD PTR [r11-40] - movaps xmm8, XMMWORD PTR [r11-56] - movaps xmm9, XMMWORD PTR [r11-72] - movaps xmm10, XMMWORD PTR [r11-88] - movaps xmm11, XMMWORD PTR [r11-104] - movaps xmm12, XMMWORD PTR [r11-120] - movaps xmm14, XMMWORD PTR [rsp+32] - movaps xmm15, XMMWORD PTR [rsp+16] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp - -div_fix_1_ultralitev2_sandybridge: - dec rbx - add r11, rdx - jmp div_fix_1_ret_ultralitev2_sandybridge - -div_fix_2_ultralitev2_sandybridge: - dec rdx - add r8, r9 - jmp div_fix_2_ret_ultralitev2_sandybridge - -sqrt_fix_1_ultralitev2_sandybridge: - movq r8, xmm3 - movdqa xmm0, xmm5 - psrldq xmm0, 8 - dec r9 - mov r11d, -1022 - shl r11, 32 - mov rax, r9 - shr r9, 19 - shr rax, 20 - mov rdx, r9 - sub rdx, rax - lea rdx, [rdx+r11+1] - add rax, r11 - imul rdx, rax - sub rdx, r8 - adc r9, 0 - movq xmm5, r9 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_ultralitev2_sandybridge - -sqrt_fix_2_ultralitev2_sandybridge: - psrldq xmm3, 8 - movq r11, xmm3 - dec r8 - mov ebx, -1022 - shl rbx, 32 - mov rax, r8 - shr r8, 19 - shr rax, 20 - mov rdx, r8 - sub rdx, rax - lea rdx, [rdx+rbx+1] - add rax, rbx - imul rdx, rax - sub rdx, r11 - adc r8, 0 - movq xmm0, r8 - punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_ultralitev2_sandybridge - -cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc deleted file mode 100644 index 311f6fa3..00000000 --- a/src/crypto/asm/win/cn_ultralitev2_main_loop_bulldozer.inc +++ /dev/null @@ -1,180 +0,0 @@ - mov QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], 
rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movd xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movd xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movd xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movd xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movd xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 16 -cnv2_main_loop_ultralitev2_bulldozer: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movd xmm6, r8 - pinsrq xmm6, r11, 1 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - - mov edi, 1023 - shl rdi, 52 - - movd r14, xmm5 - pextrq rax, xmm5, 1 - - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - div r9 - mov eax, eax - shl rdx, 32 - lea r15, 
[rax+rdx] - lea rax, [r14+r15] - shr rax, 12 - add rax, rdi - movd xmm0, rax - sqrtsd xmm1, xmm0 - movd rdi, xmm1 - test rdi, 524287 - je sqrt_fixup_ultralitev2_bulldozer - shr rdi, 19 - -sqrt_fixup_ultralitev2_bulldozer_ret: - mov rax, rsi - mul r14 - movd xmm1, rax - movd xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne cnv2_main_loop_ultralitev2_bulldozer - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp cnv2_main_loop_ultralitev2_bulldozer_endp - -sqrt_fixup_ultralitev2_bulldozer: - movd r9, xmm5 - add r9, r15 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp sqrt_fixup_ultralitev2_bulldozer_ret - -cnv2_main_loop_ultralitev2_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc deleted file mode 100644 index d2295c9a..00000000 --- a/src/crypto/asm/win/cn_ultralitev2_main_loop_ivybridge.inc +++ /dev/null @@ -1,182 +0,0 @@ - mov QWORD PTR [rsp+24], 
rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 80 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov esi, 65536 - mov r8, QWORD PTR [rcx+32] - mov r13d, -2147483647 - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm4, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - movq xmm3, QWORD PTR [r9+104] - movaps XMMWORD PTR [rsp+64], xmm6 - movaps XMMWORD PTR [rsp+48], xmm7 - movaps XMMWORD PTR [rsp+32], xmm8 - and r10d, 131056 - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm4, xmm0 - movq xmm0, rcx - punpcklqdq xmm5, xmm0 - movdqu xmm6, XMMWORD PTR [r10+rbx] - - ALIGN 64 -$main_loop_ultralitev2_ivybridge: - lea rdx, QWORD PTR [r10+rbx] - mov ecx, r10d - mov eax, r10d - mov rdi, r15 - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - movq xmm0, r11 - movq xmm7, r8 - punpcklqdq xmm7, xmm0 - aesenc xmm6, xmm7 - movq rbp, xmm6 - mov r9, rbp - and r9d, 131056 - movdqu xmm2, XMMWORD PTR [rcx+rbx] - movdqu xmm1, XMMWORD PTR [rax+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm1, xmm7 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [rcx+rbx], xmm0 - movdqu XMMWORD PTR [rax+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - mov r10, r9 - xor r10d, 32 - movq rcx, xmm3 - mov rax, rcx - shl rax, 32 - xor rdi, rax - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - movdqu XMMWORD PTR [rdx], xmm0 - xor rdi, QWORD PTR [r9+rbx] - lea r14, QWORD PTR [r9+rbx] - mov r12, QWORD PTR [r14+8] - xor edx, edx - lea r9d, DWORD PTR [ecx+ecx] - add r9d, ebp - movdqa xmm0, xmm6 - psrldq xmm0, 8 - or r9d, r13d - movq rax, xmm0 - div r9 - xorps 
xmm3, xmm3 - mov eax, eax - shl rdx, 32 - add rdx, rax - lea r9, QWORD PTR [rdx+rbp] - mov r15, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm3, xmm0 - movq rdx, xmm3 - test edx, 524287 - je $sqrt_fixup_ultralitev2_ivybridge - psrlq xmm3, 19 -$sqrt_fixup_ultralitev2_ivybridge_ret: - - mov ecx, r10d - mov rax, rdi - mul rbp - movq xmm2, rdx - xor rdx, [rcx+rbx] - add r8, rdx - mov QWORD PTR [r14], r8 - xor r8, rdi - mov edi, r8d - and edi, 131056 - movq xmm0, rax - xor rax, [rcx+rbx+8] - add r11, rax - mov QWORD PTR [r14+8], r11 - punpcklqdq xmm2, xmm0 - - mov r9d, r10d - xor r9d, 48 - xor r10d, 16 - pxor xmm2, XMMWORD PTR [r9+rbx] - movdqu xmm0, XMMWORD PTR [r10+rbx] - paddq xmm0, xmm5 - movdqu xmm1, XMMWORD PTR [rcx+rbx] - paddq xmm2, xmm4 - paddq xmm1, xmm7 - movdqa xmm5, xmm4 - movdqu XMMWORD PTR [r9+rbx], xmm0 - movdqa xmm4, xmm6 - movdqu XMMWORD PTR [rcx+rbx], xmm2 - movdqu XMMWORD PTR [r10+rbx], xmm1 - movdqu xmm6, [rdi+rbx] - mov r10d, edi - xor r11, r12 - dec rsi - jne $main_loop_ultralitev2_ivybridge - - ldmxcsr DWORD PTR [rsp] - mov rbx, QWORD PTR [rsp+160] - movaps xmm6, XMMWORD PTR [rsp+64] - movaps xmm7, XMMWORD PTR [rsp+48] - movaps xmm8, XMMWORD PTR [rsp+32] - add rsp, 80 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - jmp $cnv2_main_loop_ultralitev2_ivybridge_endp - -$sqrt_fixup_ultralitev2_ivybridge: - dec rdx - mov r13d, -1022 - shl r13, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - add rax, r13 - not r13 - sub rcx, r13 - mov r13d, -2147483647 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm3, rdx - jmp $sqrt_fixup_ultralitev2_ivybridge_ret - -$cnv2_main_loop_ultralitev2_ivybridge_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc b/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc deleted file mode 100644 index d3ef878a..00000000 --- a/src/crypto/asm/win/cn_ultralitev2_main_loop_ryzen.inc +++ /dev/null @@ -1,179 +0,0 @@ - mov 
QWORD PTR [rsp+16], rbx - mov QWORD PTR [rsp+24], rbp - mov QWORD PTR [rsp+32], rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 64 - - stmxcsr DWORD PTR [rsp] - mov DWORD PTR [rsp+4], 24448 - ldmxcsr DWORD PTR [rsp+4] - - mov rax, QWORD PTR [rcx+48] - mov r9, rcx - xor rax, QWORD PTR [rcx+16] - mov ebp, 65536 - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r11, QWORD PTR [rcx+40] - mov r10, r8 - mov rdx, QWORD PTR [rcx+56] - movq xmm3, rax - xor rdx, QWORD PTR [rcx+24] - xor r11, QWORD PTR [rcx+8] - mov rbx, QWORD PTR [rcx+224] - mov rax, QWORD PTR [r9+80] - xor rax, QWORD PTR [r9+64] - movq xmm0, rdx - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r9+72] - mov rdi, QWORD PTR [r9+104] - and r10d, 131056 - movaps XMMWORD PTR [rsp+48], xmm6 - movq xmm4, rax - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+16], xmm8 - xorps xmm8, xmm8 - mov ax, 1023 - shl rax, 52 - movq xmm7, rax - mov r15, QWORD PTR [r9+96] - punpcklqdq xmm3, xmm0 - movq xmm0, rcx - punpcklqdq xmm4, xmm0 - - ALIGN 64 -$main_loop_ultralitev2_ryzen: - movdqa xmm5, XMMWORD PTR [r10+rbx] - movq xmm0, r11 - movq xmm6, r8 - punpcklqdq xmm6, xmm0 - lea rdx, QWORD PTR [r10+rbx] - lea r9, QWORD PTR [rdi+rdi] - shl rdi, 32 - - mov ecx, r10d - mov eax, r10d - xor ecx, 16 - xor eax, 32 - xor r10d, 48 - aesenc xmm5, xmm6 - movdqa xmm2, XMMWORD PTR [rcx+rbx] - movdqa xmm1, XMMWORD PTR [rax+rbx] - movdqa xmm0, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - paddq xmm0, xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm0 - movdqa XMMWORD PTR [rax+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movaps xmm1, xmm8 - mov rsi, r15 - xor rsi, rdi - movq r14, xmm5 - movdqa xmm0, xmm5 - pxor xmm0, xmm3 - mov r10, r14 - and r10d, 131056 - movdqa XMMWORD PTR [rdx], xmm0 - xor rsi, QWORD PTR [r10+rbx] - lea r12, QWORD PTR [r10+rbx] - mov r13, QWORD PTR [r10+rbx+8] - - add r9d, r14d - or r9d, -2147483647 - xor edx, edx - movdqa xmm0, xmm5 - psrldq xmm0, 8 - movq rax, 
xmm0 - - div r9 - movq xmm0, rax - movq xmm1, rdx - punpckldq xmm0, xmm1 - movq r15, xmm0 - paddq xmm0, xmm5 - movdqa xmm2, xmm0 - psrlq xmm0, 12 - paddq xmm0, xmm7 - sqrtsd xmm1, xmm0 - movq rdi, xmm1 - test rdi, 524287 - je $sqrt_fixup_ultralitev2_ryzen - shr rdi, 19 - -$sqrt_fixup_ultralitev2_ryzen_ret: - mov rax, rsi - mul r14 - movq xmm1, rax - movq xmm0, rdx - punpcklqdq xmm0, xmm1 - - mov r9d, r10d - mov ecx, r10d - xor r9d, 16 - xor ecx, 32 - xor r10d, 48 - movdqa xmm1, XMMWORD PTR [rcx+rbx] - xor rdx, [rcx+rbx] - xor rax, [rcx+rbx+8] - movdqa xmm2, XMMWORD PTR [r9+rbx] - pxor xmm2, xmm0 - paddq xmm4, XMMWORD PTR [r10+rbx] - paddq xmm2, xmm3 - paddq xmm1, xmm6 - movdqa XMMWORD PTR [r9+rbx], xmm4 - movdqa XMMWORD PTR [rcx+rbx], xmm2 - movdqa XMMWORD PTR [r10+rbx], xmm1 - - movdqa xmm4, xmm3 - add r8, rdx - add r11, rax - mov QWORD PTR [r12], r8 - xor r8, rsi - mov QWORD PTR [r12+8], r11 - mov r10, r8 - xor r11, r13 - and r10d, 131056 - movdqa xmm3, xmm5 - dec ebp - jne $main_loop_ultralitev2_ryzen - - ldmxcsr DWORD PTR [rsp] - movaps xmm6, XMMWORD PTR [rsp+48] - lea r11, QWORD PTR [rsp+64] - mov rbx, QWORD PTR [r11+56] - mov rbp, QWORD PTR [r11+64] - mov rsi, QWORD PTR [r11+72] - movaps xmm8, XMMWORD PTR [r11-48] - movaps xmm7, XMMWORD PTR [rsp+32] - mov rsp, r11 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - jmp $cnv2_main_loop_ultralitev2_ryzen_endp - -$sqrt_fixup_ultralitev2_ryzen: - movq r9, xmm2 - dec rdi - mov edx, -1022 - shl rdx, 32 - mov rax, rdi - shr rdi, 19 - shr rax, 20 - mov rcx, rdi - sub rcx, rax - lea rcx, [rcx+rdx+1] - add rax, rdx - imul rcx, rax - sub rcx, r9 - adc rdi, 0 - jmp $sqrt_fixup_ultralitev2_ryzen_ret - -$cnv2_main_loop_ultralitev2_ryzen_endp: diff --git a/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc deleted file mode 100644 index 7025a29e..00000000 --- a/src/crypto/asm/win/cn_ultralitev2_mainloop_soft_aes_sandybridge.inc +++ 
/dev/null @@ -1,267 +0,0 @@ - mov QWORD PTR [rsp+8], rcx - push rbx - push rbp - push rsi - push rdi - push r12 - push r13 - push r14 - push r15 - sub rsp, 152 - - stmxcsr DWORD PTR [rsp+4] - mov DWORD PTR [rsp], 24448 - ldmxcsr DWORD PTR [rsp] - - mov rax, QWORD PTR [rcx+48] - mov r10, rcx - xor rax, QWORD PTR [rcx+16] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - mov r9, QWORD PTR [rcx+40] - xor r9, QWORD PTR [rcx+8] - movq xmm4, rax - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r11, QWORD PTR [rcx+224] - mov rcx, QWORD PTR [rcx+88] - xor rcx, QWORD PTR [r10+72] - mov rax, QWORD PTR [r10+80] - movq xmm0, rdx - xor rax, QWORD PTR [r10+64] - - movaps XMMWORD PTR [rsp+16], xmm6 - movaps XMMWORD PTR [rsp+32], xmm7 - movaps XMMWORD PTR [rsp+48], xmm8 - movaps XMMWORD PTR [rsp+64], xmm9 - movaps XMMWORD PTR [rsp+80], xmm10 - movaps XMMWORD PTR [rsp+96], xmm11 - movaps XMMWORD PTR [rsp+112], xmm12 - movaps XMMWORD PTR [rsp+128], xmm13 - - movq xmm5, rax - - mov ax, 1023 - shl rax, 52 - movq xmm8, rax - - mov rax, r8 - punpcklqdq xmm4, xmm0 - and eax, 131056 - movq xmm10, QWORD PTR [r10+96] - movq xmm0, rcx - mov rcx, QWORD PTR [r10+104] - xorps xmm9, xmm9 - mov QWORD PTR [rsp+248], rax - movq xmm12, r11 - mov QWORD PTR [rsp+240], r9 - punpcklqdq xmm5, xmm0 - movq xmm13, rcx - mov r12d, 65536 - - ALIGN 64 -cnv2_mainloop_soft_aes_ultralitev2_sandybridge: - movd xmm11, r12d - mov r12, QWORD PTR [r10+272] - lea r13, QWORD PTR [rax+r11] - mov esi, DWORD PTR [r13] - movq xmm0, r9 - mov r10d, DWORD PTR [r13+4] - movq xmm7, r8 - mov ebp, DWORD PTR [r13+12] - mov r14d, DWORD PTR [r13+8] - mov rdx, QWORD PTR [rsp+248] - movzx ecx, sil - shr esi, 8 - punpcklqdq xmm7, xmm0 - mov r15d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - mov edi, DWORD PTR [r12+rcx*4] - movzx ecx, r14b - shr r14d, 8 - mov ebx, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - shr ebp, 8 - mov r9d, DWORD PTR [r12+rcx*4] - movzx ecx, r10b - shr r10d, 8 - xor r15d, DWORD PTR 
[r12+rcx*4+1024] - movzx ecx, r14b - shr r14d, 8 - mov eax, r14d - shr eax, 8 - xor edi, DWORD PTR [r12+rcx*4+1024] - add eax, 256 - movzx ecx, bpl - shr ebp, 8 - xor ebx, DWORD PTR [r12+rcx*4+1024] - movzx ecx, sil - shr esi, 8 - xor r9d, DWORD PTR [r12+rcx*4+1024] - add r12, 2048 - movzx ecx, r10b - shr r10d, 8 - add r10d, 256 - mov r11d, DWORD PTR [r12+rax*4] - xor r11d, DWORD PTR [r12+rcx*4] - xor r11d, r9d - movzx ecx, sil - mov r10d, DWORD PTR [r12+r10*4] - shr esi, 8 - add esi, 256 - xor r10d, DWORD PTR [r12+rcx*4] - movzx ecx, bpl - xor r10d, ebx - shr ebp, 8 - movd xmm1, r11d - add ebp, 256 - movq r11, xmm12 - mov r9d, DWORD PTR [r12+rcx*4] - xor r9d, DWORD PTR [r12+rsi*4] - mov eax, DWORD PTR [r12+rbp*4] - xor r9d, edi - movzx ecx, r14b - movd xmm0, r10d - movd xmm2, r9d - xor eax, DWORD PTR [r12+rcx*4] - mov rcx, rdx - xor eax, r15d - punpckldq xmm2, xmm1 - xor rcx, 16 - movd xmm6, eax - mov rax, rdx - punpckldq xmm6, xmm0 - xor rax, 32 - punpckldq xmm6, xmm2 - xor rdx, 48 - movdqu xmm2, XMMWORD PTR [rcx+r11] - pxor xmm6, xmm7 - paddq xmm2, xmm4 - movdqu xmm1, XMMWORD PTR [rax+r11] - movdqu xmm0, XMMWORD PTR [rdx+r11] - paddq xmm0, xmm5 - movdqu XMMWORD PTR [rcx+r11], xmm0 - movdqu XMMWORD PTR [rax+r11], xmm2 - movq rcx, xmm13 - paddq xmm1, xmm7 - movdqu XMMWORD PTR [rdx+r11], xmm1 - movq rdi, xmm6 - mov r10, rdi - and r10d, 131056 - xor edx, edx - mov rax, rcx - shl rax, 32 - movq rbx, xmm10 - xor rbx, rax - lea r9, QWORD PTR [rcx+rcx] - add r9d, edi - movdqa xmm0, xmm6 - pxor xmm0, xmm4 - mov ecx, -2147483647 - movdqu XMMWORD PTR [r13], xmm0 - or r9, rcx - movdqa xmm0, xmm6 - movaps xmm1, xmm9 - psrldq xmm0, 8 - movq rax, xmm0 - xor rbx, QWORD PTR [r10+r11] - lea r14, QWORD PTR [r10+r11] - mov rbp, QWORD PTR [r14+8] - div r9 - shl rdx, 32 - mov eax, eax - add rdx, rax - lea r9, QWORD PTR [rdx+rdi] - movq xmm10, rdx - mov rax, r9 - shr rax, 12 - movq xmm0, rax - paddq xmm0, xmm8 - sqrtsd xmm1, xmm0 - movq rdx, xmm1 - test rdx, 524287 - je 
sqrt_fixup_soft_aes_ultralitev2_sandybridge - psrlq xmm1, 19 -sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret: - - mov r9, r10 - movdqa xmm13, xmm1 - xor r9, 16 - mov rcx, r10 - xor rcx, 32 - xor r10, 48 - mov rax, rbx - mul rdi - movdqu xmm2, XMMWORD PTR [r9+r11] - movdqu xmm1, XMMWORD PTR [rcx+r11] - paddq xmm1, xmm7 - movq xmm0, rax - movq xmm3, rdx - xor rax, QWORD PTR [r11+rcx+8] - xor rdx, QWORD PTR [rcx+r11] - punpcklqdq xmm3, xmm0 - add r8, rdx - movdqu xmm0, XMMWORD PTR [r10+r11] - pxor xmm2, xmm3 - paddq xmm0, xmm5 - paddq xmm2, xmm4 - movdqu XMMWORD PTR [r9+r11], xmm0 - movdqa xmm5, xmm4 - mov r9, QWORD PTR [rsp+240] - movdqa xmm4, xmm6 - add r9, rax - movdqu XMMWORD PTR [rcx+r11], xmm2 - movdqu XMMWORD PTR [r10+r11], xmm1 - mov r10, QWORD PTR [rsp+224] - movd r12d, xmm11 - mov QWORD PTR [r14], r8 - xor r8, rbx - mov rax, r8 - mov QWORD PTR [r14+8], r9 - and eax, 131056 - xor r9, rbp - mov QWORD PTR [rsp+240], r9 - mov QWORD PTR [rsp+248], rax - sub r12d, 1 - jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge - - ldmxcsr DWORD PTR [rsp+4] - movaps xmm6, XMMWORD PTR [rsp+16] - movaps xmm7, XMMWORD PTR [rsp+32] - movaps xmm8, XMMWORD PTR [rsp+48] - movaps xmm9, XMMWORD PTR [rsp+64] - movaps xmm10, XMMWORD PTR [rsp+80] - movaps xmm11, XMMWORD PTR [rsp+96] - movaps xmm12, XMMWORD PTR [rsp+112] - movaps xmm13, XMMWORD PTR [rsp+128] - - add rsp, 152 - pop r15 - pop r14 - pop r13 - pop r12 - pop rdi - pop rsi - pop rbp - pop rbx - jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp - -sqrt_fixup_soft_aes_ultralitev2_sandybridge: - dec rdx - mov r15d, -1022 - shl r15, 32 - mov rax, rdx - shr rdx, 19 - shr rax, 20 - mov rcx, rdx - sub rcx, rax - lea rcx, [rcx+r15+1] - add rax, r15 - imul rcx, rax - sub rcx, r9 - adc rdx, 0 - movq xmm1, rdx - jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret - -cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp: diff --git a/src/crypto/asm/win/cn_litev1_mainloop_sandybridge.inc 
b/src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in similarity index 89% rename from src/crypto/asm/win/cn_litev1_mainloop_sandybridge.inc rename to src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in index 289c3de8..2fe423f4 100644 --- a/src/crypto/asm/win/cn_litev1_mainloop_sandybridge.inc +++ b/src/crypto/asm/win/cnv1_main_loop_sandybridge.inc.in @@ -5,7 +5,7 @@ push r14 push r15 mov rax, QWORD PTR [rcx+48] - mov ebp, 262144 + mov ebp, ${ITERATIONS} xor rax, QWORD PTR [rcx+16] mov rdx, QWORD PTR [rcx+56] xor rdx, QWORD PTR [rcx+24] @@ -18,7 +18,7 @@ xor rdi, QWORD PTR [rcx+8] mov rdx, r8 mov r15, QWORD PTR [rcx+264] - and edx, 1048560 + and edx, ${MASK} mov r14, QWORD PTR [rax+35] xor r14, QWORD PTR [rcx+192] mov rsi, QWORD PTR [rcx+224] @@ -26,14 +26,14 @@ movdqu xmm2, XMMWORD PTR [rdx+rsi] ALIGN 64 -cn_litev1_mainloop_sandybridge: +cnv1_main_loop_${ALGO}_sandybridge: movq xmm0, rdi movq xmm1, r8 punpcklqdq xmm1, xmm0 aesenc xmm2, xmm1 movq r10, xmm2 mov r9d, r10d - and r9d, 1048560 + and r9d, ${MASK} add r9, rsi movdqa xmm0, xmm2 pxor xmm0, xmm3 @@ -56,11 +56,11 @@ cn_litev1_mainloop_sandybridge: mov QWORD PTR [r9+8], rax xor r8, rbx mov rdx, r8 - and edx, 1048560 + and edx, ${MASK} movdqu xmm2, XMMWORD PTR [rdx+rsi] xor rdi, r11 dec ebp - jne cn_litev1_mainloop_sandybridge + jne cnv1_main_loop_${ALGO}_sandybridge mov rbx, QWORD PTR [rsp+24] mov rbp, QWORD PTR [rsp+32] diff --git a/src/crypto/asm/win/cnv1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in similarity index 94% rename from src/crypto/asm/win/cnv1_mainloop_soft_aes_sandybridge.inc rename to src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in index beb64b9d..549ea2ea 100644 --- a/src/crypto/asm/win/cnv1_mainloop_soft_aes_sandybridge.inc +++ b/src/crypto/asm/win/cnv1_main_loop_soft_aes_sandybridge.inc.in @@ -26,7 +26,7 @@ xor r13, QWORD PTR [rcx+8] mov rdx, r8 mov rdi, QWORD PTR [rcx+224] - and edx, 2097136 + and edx, ${MASK} mov rax, 
QWORD PTR [rax+35] xor rax, QWORD PTR [rcx+192] movq xmm5, rax @@ -38,10 +38,10 @@ mov rax, QWORD PTR [rcx+264] movq xmm7, rax - mov eax, 524288 + mov eax, ${ITERATIONS} ALIGN 64 -cnv1_mainloop_soft_aes_sandybridge: +cnv1_main_loop_${ALGO}_soft_aes_sandybridge: movq xmm9, rax mov r12, QWORD PTR [rcx+272] mov esi, DWORD PTR [rdx+rdi] @@ -114,7 +114,7 @@ cnv1_mainloop_soft_aes_sandybridge: pxor xmm3, xmm1 movq r9, xmm3 mov r10d, r9d - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm0, xmm3 pxor xmm0, xmm4 movdqu XMMWORD PTR [rdx+rdi], xmm0 @@ -141,10 +141,10 @@ cnv1_mainloop_soft_aes_sandybridge: movq rax, xmm9 mov rdx, r8 xor r13, r11 - and edx, 2097136 + and edx, ${MASK} mov QWORD PTR [rsp+64], rdx sub eax, 1 - jne cnv1_mainloop_soft_aes_sandybridge + jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge movaps xmm6, XMMWORD PTR [rsp] movaps xmm7, XMMWORD PTR [rsp+16] diff --git a/src/crypto/asm/win/cnv1_mainloop_sandybridge.inc b/src/crypto/asm/win/cnv1_mainloop_sandybridge.inc deleted file mode 100644 index 96502993..00000000 --- a/src/crypto/asm/win/cnv1_mainloop_sandybridge.inc +++ /dev/null @@ -1,70 +0,0 @@ - mov QWORD PTR [rsp+8], rbx - mov QWORD PTR [rsp+16], rbp - mov QWORD PTR [rsp+24], rsi - mov QWORD PTR [rsp+32], rdi - push r14 - push r15 - mov rax, QWORD PTR [rcx+48] - mov ebp, 524288 - xor rax, QWORD PTR [rcx+16] - mov rdx, QWORD PTR [rcx+56] - xor rdx, QWORD PTR [rcx+24] - mov r8, QWORD PTR [rcx+32] - xor r8, QWORD PTR [rcx] - movq xmm3, rax - mov rax, QWORD PTR [rcx+256] - mov rdi, QWORD PTR [rcx+40] - movq xmm0, rdx - xor rdi, QWORD PTR [rcx+8] - mov rdx, r8 - mov r15, QWORD PTR [rcx+264] - and edx, 2097136 - mov r14, QWORD PTR [rax+35] - xor r14, QWORD PTR [rcx+192] - mov rsi, QWORD PTR [rcx+224] - punpcklqdq xmm3, xmm0 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - - ALIGN 64 -cnv1_mainloop_sandybridge: - movq xmm0, rdi - movq xmm1, r8 - punpcklqdq xmm1, xmm0 - aesenc xmm2, xmm1 - movq r10, xmm2 - mov r9d, r10d - and r9d, 2097136 - add r9, rsi - movdqa xmm0, 
xmm2 - pxor xmm0, xmm3 - movdqa xmm3, xmm2 - movdqu XMMWORD PTR [rdx+rsi], xmm0 - psrldq xmm0, 11 - movq rax, xmm0 - movzx eax, al - movzx eax, BYTE PTR [rax+r15] - mov BYTE PTR [rsi+rdx+11], al - mov rbx, QWORD PTR [r9] - mov r11, QWORD PTR [r9+8] - mov rax, rbx - mul r10 - add r8, rdx - mov QWORD PTR [r9], r8 - add rdi, rax - mov rax, r14 - xor rax, rdi - mov QWORD PTR [r9+8], rax - xor r8, rbx - mov rdx, r8 - and edx, 2097136 - movdqu xmm2, XMMWORD PTR [rdx+rsi] - xor rdi, r11 - dec ebp - jne cnv1_mainloop_sandybridge - - mov rbx, QWORD PTR [rsp+24] - mov rbp, QWORD PTR [rsp+32] - mov rsi, QWORD PTR [rsp+40] - mov rdi, QWORD PTR [rsp+48] - pop r15 - pop r14 diff --git a/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in similarity index 89% rename from src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc rename to src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in index 858edd1d..2821945c 100644 --- a/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc +++ b/src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in @@ -18,7 +18,7 @@ mov r10, QWORD PTR [rcx+32] mov r8, rcx xor r10, QWORD PTR [rcx] - mov r14d, 524288 + mov r14d, ${ITERATIONS} mov r11, QWORD PTR [rcx+40] xor r11, QWORD PTR [rcx+8] mov rsi, QWORD PTR [rdx+224] @@ -41,7 +41,7 @@ movaps XMMWORD PTR [rsp+16], xmm15 mov rdx, r10 movq xmm4, QWORD PTR [r8+96] - and edx, 2097136 + and edx, ${MASK} mov rax, QWORD PTR [rcx+48] xorps xmm13, xmm13 xor rax, QWORD PTR [rcx+16] @@ -83,7 +83,7 @@ mov rcx, rdi mov QWORD PTR [rsp+264], r11 movq xmm8, rax - and ecx, 2097136 + and ecx, ${MASK} punpcklqdq xmm8, xmm0 movq xmm0, QWORD PTR [r9+96] punpcklqdq xmm4, xmm0 @@ -95,7 +95,7 @@ movdqu xmm15, XMMWORD PTR [r9] ALIGN 64 -main_loop_double_sandybridge: +cnv2_double_main_loop_${ALGO}_sandybridge: movdqu xmm9, xmm15 mov eax, edx mov ebx, edx @@ -120,7 +120,7 @@ main_loop_double_sandybridge: movq r11, xmm9 mov edx, r11d - and edx, 
2097136 + and edx, ${MASK} movdqa xmm0, xmm9 pxor xmm0, xmm7 movdqu XMMWORD PTR [r9], xmm0 @@ -151,7 +151,7 @@ main_loop_double_sandybridge: movdqu XMMWORD PTR [rax+rsi], xmm0 movq rcx, xmm10 - and ecx, 2097136 + and ecx, ${MASK} movdqa xmm0, xmm10 pxor xmm0, xmm6 @@ -199,7 +199,7 @@ main_loop_double_sandybridge: mov QWORD PTR [rbx+8], rdx xor rdx, r9 mov QWORD PTR [rsp+256], r11 - and r11d, 2097136 + and r11d, ${MASK} mov QWORD PTR [rsp+264], rdx mov QWORD PTR [rsp+8], r11 lea r15, QWORD PTR [r11+r13] @@ -249,8 +249,8 @@ main_loop_double_sandybridge: mov rbx, rax imul rax, rdx sub r11, rax - js div_fix_1_sandybridge -div_fix_1_ret_sandybridge: + js div_fix_1_${ALGO}_sandybridge +div_fix_1_ret_${ALGO}_sandybridge: cvttsd2si rdx, xmm2 mov rax, rdx @@ -258,8 +258,8 @@ div_fix_1_ret_sandybridge: movd xmm2, r11d movd xmm4, ebx sub r8, rax - js div_fix_2_sandybridge -div_fix_2_ret_sandybridge: + js div_fix_2_${ALGO}_sandybridge +div_fix_2_ret_${ALGO}_sandybridge: movd xmm1, r8d movd xmm0, edx @@ -275,15 +275,15 @@ div_fix_2_ret_sandybridge: movdqa xmm5, xmm1 psrlq xmm5, 19 test r9, 524287 - je sqrt_fix_1_sandybridge -sqrt_fix_1_ret_sandybridge: + je sqrt_fix_1_${ALGO}_sandybridge +sqrt_fix_1_ret_${ALGO}_sandybridge: movq r9, xmm10 psrldq xmm1, 8 movq r8, xmm1 test r8, 524287 - je sqrt_fix_2_sandybridge -sqrt_fix_2_ret_sandybridge: + je sqrt_fix_2_${ALGO}_sandybridge +sqrt_fix_2_ret_${ALGO}_sandybridge: mov r12d, ecx mov r8d, ecx @@ -313,7 +313,7 @@ sqrt_fix_2_ret_sandybridge: mov QWORD PTR [r13], rdi xor rdi, r10 mov ecx, edi - and ecx, 2097136 + and ecx, ${MASK} lea r8, QWORD PTR [rcx+rsi] mov rdx, QWORD PTR [r13+8] @@ -331,7 +331,7 @@ sqrt_fix_2_ret_sandybridge: movdqa xmm6, xmm10 mov r9, r15 dec r14d - jne main_loop_double_sandybridge + jne cnv2_double_main_loop_${ALGO}_sandybridge ldmxcsr DWORD PTR [rsp+272] movaps xmm13, XMMWORD PTR [rsp+48] @@ -354,19 +354,19 @@ sqrt_fix_2_ret_sandybridge: pop rsi pop rbp pop rbx - jmp cnv2_double_mainloop_asm_sandybridge_endp + 
jmp cnv2_double_main_loop_${ALGO}_sandybridge_endp -div_fix_1_sandybridge: +div_fix_1_${ALGO}_sandybridge: dec rbx add r11, rdx - jmp div_fix_1_ret_sandybridge + jmp div_fix_1_ret_${ALGO}_sandybridge -div_fix_2_sandybridge: +div_fix_2_${ALGO}_sandybridge: dec rdx add r8, r9 - jmp div_fix_2_ret_sandybridge + jmp div_fix_2_ret_${ALGO}_sandybridge -sqrt_fix_1_sandybridge: +sqrt_fix_1_${ALGO}_sandybridge: movq r8, xmm3 movdqa xmm0, xmm5 psrldq xmm0, 8 @@ -385,9 +385,9 @@ sqrt_fix_1_sandybridge: adc r9, 0 movq xmm5, r9 punpcklqdq xmm5, xmm0 - jmp sqrt_fix_1_ret_sandybridge + jmp sqrt_fix_1_ret_${ALGO}_sandybridge -sqrt_fix_2_sandybridge: +sqrt_fix_2_${ALGO}_sandybridge: psrldq xmm3, 8 movq r11, xmm3 dec r8 @@ -405,6 +405,6 @@ sqrt_fix_2_sandybridge: adc r8, 0 movq xmm0, r8 punpcklqdq xmm5, xmm0 - jmp sqrt_fix_2_ret_sandybridge + jmp sqrt_fix_2_ret_${ALGO}_sandybridge -cnv2_double_mainloop_asm_sandybridge_endp: +cnv2_double_main_loop_${ALGO}_sandybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc b/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in similarity index 89% rename from src/crypto/asm/win/cnv2_main_loop_bulldozer.inc rename to src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in index 55452cad..163cc0e5 100644 --- a/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc +++ b/src/crypto/asm/win/cnv2_main_loop_bulldozer.inc.in @@ -15,7 +15,7 @@ mov rax, QWORD PTR [rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 + mov ebp, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] xor r8, QWORD PTR [rcx] mov r11, QWORD PTR [rcx+40] @@ -31,7 +31,7 @@ mov rcx, QWORD PTR [rcx+88] xor rcx, QWORD PTR [r9+72] mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 + and r10d, ${MASK} movaps XMMWORD PTR [rsp+48], xmm6 movd xmm4, rax movaps XMMWORD PTR [rsp+32], xmm7 @@ -46,7 +46,7 @@ punpcklqdq xmm4, xmm0 ALIGN 16 -cnv2_main_loop_bulldozer: +cnv2_main_loop_${ALGO}_bulldozer: movdqa xmm5, XMMWORD PTR [r10+rbx] movd xmm6, r8 pinsrq xmm6, r11, 1 @@ -83,7 +83,7 @@ 
cnv2_main_loop_bulldozer: movdqa xmm0, xmm5 pxor xmm0, xmm3 mov r10, r14 - and r10d, 2097136 + and r10d, ${MASK} movdqa XMMWORD PTR [rdx], xmm0 xor rsi, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx] @@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer: sqrtsd xmm1, xmm0 movd rdi, xmm1 test rdi, 524287 - je sqrt_fixup_bulldozer + je sqrt_fixup_${ALGO}_bulldozer shr rdi, 19 -sqrt_fixup_bulldozer_ret: +sqrt_fixup_${ALGO}_bulldozer_ret: mov rax, rsi mul r14 movd xmm1, rax @@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret: mov QWORD PTR [r12+8], r11 mov r10, r8 xor r11, r13 - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm3, xmm5 dec ebp - jne cnv2_main_loop_bulldozer + jne cnv2_main_loop_${ALGO}_bulldozer ldmxcsr DWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp+48] @@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret: pop r13 pop r12 pop rdi - jmp cnv2_main_loop_bulldozer_endp + jmp cnv2_main_loop_${ALGO}_bulldozer_endp -sqrt_fixup_bulldozer: +sqrt_fixup_${ALGO}_bulldozer: movd r9, xmm5 add r9, r15 dec rdi @@ -175,6 +175,6 @@ sqrt_fixup_bulldozer: imul rcx, rax sub rcx, r9 adc rdi, 0 - jmp sqrt_fixup_bulldozer_ret + jmp sqrt_fixup_${ALGO}_bulldozer_ret -cnv2_main_loop_bulldozer_endp: \ No newline at end of file +cnv2_main_loop_${ALGO}_bulldozer_endp: \ No newline at end of file diff --git a/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in similarity index 89% rename from src/crypto/asm/win/cnv2_main_loop_ivybridge.inc rename to src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in index a5e86b56..1094c6d7 100644 --- a/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc +++ b/src/crypto/asm/win/cnv2_main_loop_ivybridge.inc.in @@ -15,7 +15,7 @@ mov rax, QWORD PTR [rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov esi, 524288 + mov esi, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] mov r13d, -2147483647 xor r8, QWORD PTR [rcx] @@ -35,7 +35,7 @@ movaps XMMWORD PTR [rsp+64], xmm6 movaps XMMWORD PTR [rsp+48], xmm7 movaps XMMWORD PTR [rsp+32], xmm8 
- and r10d, 2097136 + and r10d, ${MASK} movq xmm5, rax mov ax, 1023 @@ -48,7 +48,7 @@ movdqu xmm6, XMMWORD PTR [r10+rbx] ALIGN 64 -$main_loop_ivybridge: +cnv2_main_loop_${ALGO}_ivybridge: lea rdx, QWORD PTR [r10+rbx] mov ecx, r10d mov eax, r10d @@ -62,7 +62,7 @@ $main_loop_ivybridge: aesenc xmm6, xmm7 movq rbp, xmm6 mov r9, rbp - and r9d, 2097136 + and r9d, ${MASK} movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx] @@ -105,9 +105,9 @@ $main_loop_ivybridge: sqrtsd xmm3, xmm0 movq rdx, xmm3 test edx, 524287 - je $sqrt_fixup_ivybridge + je sqrt_fixup_${ALGO}_ivybridge psrlq xmm3, 19 -$sqrt_fixup_ivybridge_ret: +sqrt_fixup_${ALGO}_ivybridge_ret: mov ecx, r10d mov rax, rdi @@ -118,7 +118,7 @@ $sqrt_fixup_ivybridge_ret: mov QWORD PTR [r14], r8 xor r8, rdi mov edi, r8d - and edi, 2097136 + and edi, ${MASK} movq xmm0, rax xor rax, [rcx+rbx+8] add r11, rax @@ -143,7 +143,7 @@ $sqrt_fixup_ivybridge_ret: mov r10d, edi xor r11, r12 dec rsi - jne $main_loop_ivybridge + jne cnv2_main_loop_${ALGO}_ivybridge ldmxcsr DWORD PTR [rsp] mov rbx, QWORD PTR [rsp+160] @@ -158,9 +158,9 @@ $sqrt_fixup_ivybridge_ret: pop rdi pop rsi pop rbp - jmp $cnv2_main_loop_ivybridge_endp + jmp cnv2_main_loop_${ALGO}_ivybridge_endp -$sqrt_fixup_ivybridge: +sqrt_fixup_${ALGO}_ivybridge: dec rdx mov r13d, -1022 shl r13, 32 @@ -177,6 +177,6 @@ $sqrt_fixup_ivybridge: sub rcx, r9 adc rdx, 0 movq xmm3, rdx - jmp $sqrt_fixup_ivybridge_ret + jmp sqrt_fixup_${ALGO}_ivybridge_ret -$cnv2_main_loop_ivybridge_endp: +cnv2_main_loop_${ALGO}_ivybridge_endp: diff --git a/src/crypto/asm/win/cnv2_main_loop_ryzen.inc b/src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in similarity index 90% rename from src/crypto/asm/win/cnv2_main_loop_ryzen.inc rename to src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in index 3ce24964..4fef1335 100644 --- a/src/crypto/asm/win/cnv2_main_loop_ryzen.inc +++ b/src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in @@ -15,7 +15,7 @@ mov rax, QWORD PTR 
[rcx+48] mov r9, rcx xor rax, QWORD PTR [rcx+16] - mov ebp, 524288 + mov ebp, ${ITERATIONS} mov r8, QWORD PTR [rcx+32] xor r8, QWORD PTR [rcx] mov r11, QWORD PTR [rcx+40] @@ -31,7 +31,7 @@ mov rcx, QWORD PTR [rcx+88] xor rcx, QWORD PTR [r9+72] mov rdi, QWORD PTR [r9+104] - and r10d, 2097136 + and r10d, ${MASK} movaps XMMWORD PTR [rsp+48], xmm6 movq xmm4, rax movaps XMMWORD PTR [rsp+32], xmm7 @@ -46,7 +46,7 @@ punpcklqdq xmm4, xmm0 ALIGN 64 -$main_loop_ryzen: +cnv2_main_loop_${ALGO}_ryzen: movdqa xmm5, XMMWORD PTR [r10+rbx] movq xmm0, r11 movq xmm6, r8 @@ -78,7 +78,7 @@ $main_loop_ryzen: movdqa xmm0, xmm5 pxor xmm0, xmm3 mov r10, r14 - and r10d, 2097136 + and r10d, ${MASK} movdqa XMMWORD PTR [rdx], xmm0 xor rsi, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx] @@ -103,10 +103,10 @@ $main_loop_ryzen: sqrtsd xmm1, xmm0 movq rdi, xmm1 test rdi, 524287 - je $sqrt_fixup_ryzen + je sqrt_fixup_${ALGO}_ryzen shr rdi, 19 -$sqrt_fixup_ryzen_ret: +sqrt_fixup_${ALGO}_ryzen_ret: mov rax, rsi mul r14 movq xmm1, rax @@ -138,10 +138,10 @@ $sqrt_fixup_ryzen_ret: mov QWORD PTR [r12+8], r11 mov r10, r8 xor r11, r13 - and r10d, 2097136 + and r10d, ${MASK} movdqa xmm3, xmm5 dec ebp - jne $main_loop_ryzen + jne cnv2_main_loop_${ALGO}_ryzen ldmxcsr DWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp+48] @@ -157,9 +157,9 @@ $sqrt_fixup_ryzen_ret: pop r13 pop r12 pop rdi - jmp $cnv2_main_loop_ryzen_endp + jmp cnv2_main_loop_${ALGO}_ryzen_endp -$sqrt_fixup_ryzen: +sqrt_fixup_${ALGO}_ryzen: movq r9, xmm2 dec rdi mov edx, -1022 @@ -174,6 +174,6 @@ $sqrt_fixup_ryzen: imul rcx, rax sub rcx, r9 adc rdi, 0 - jmp $sqrt_fixup_ryzen_ret + jmp sqrt_fixup_${ALGO}_ryzen_ret -$cnv2_main_loop_ryzen_endp: +cnv2_main_loop_${ALGO}_ryzen_endp: diff --git a/src/crypto/asm/win/cnv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in similarity index 91% rename from src/crypto/asm/win/cnv2_mainloop_soft_aes_sandybridge.inc rename to 
src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in index 7771ad47..87d8d10e 100644 --- a/src/crypto/asm/win/cnv2_mainloop_soft_aes_sandybridge.inc +++ b/src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in @@ -47,7 +47,7 @@ mov rax, r8 punpcklqdq xmm4, xmm0 - and eax, 2097136 + and eax, ${MASK} movq xmm10, QWORD PTR [r10+96] movq xmm0, rcx mov rcx, QWORD PTR [r10+104] @@ -57,10 +57,10 @@ mov QWORD PTR [rsp+240], r9 punpcklqdq xmm5, xmm0 movq xmm13, rcx - mov r12d, 524288 + mov r12d, ${ITERATIONS} ALIGN 64 -cnv2_mainloop_soft_aes_sandybridge: +cnv2_main_loop_${ALGO}_soft_aes_sandybridge: movd xmm11, r12d mov r12, QWORD PTR [r10+272] lea r13, QWORD PTR [rax+r11] @@ -148,7 +148,7 @@ cnv2_mainloop_soft_aes_sandybridge: movdqu XMMWORD PTR [rdx+r11], xmm1 movq rdi, xmm6 mov r10, rdi - and r10d, 2097136 + and r10d, ${MASK} xor edx, edx mov rax, rcx shl rax, 32 @@ -181,9 +181,9 @@ cnv2_mainloop_soft_aes_sandybridge: sqrtsd xmm1, xmm0 movq rdx, xmm1 test rdx, 524287 - je sqrt_fixup_soft_aes_sandybridge + je sqrt_fixup_${ALGO}_soft_aes_sandybridge psrlq xmm1, 19 -sqrt_fixup_soft_aes_sandybridge_ret: +sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret: mov r9, r10 movdqa xmm13, xmm1 @@ -219,12 +219,12 @@ sqrt_fixup_soft_aes_sandybridge_ret: xor r8, rbx mov rax, r8 mov QWORD PTR [r14+8], r9 - and eax, 2097136 + and eax, ${MASK} xor r9, rbp mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+248], rax sub r12d, 1 - jne cnv2_mainloop_soft_aes_sandybridge + jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge ldmxcsr DWORD PTR [rsp+4] movaps xmm6, XMMWORD PTR [rsp+16] @@ -245,9 +245,9 @@ sqrt_fixup_soft_aes_sandybridge_ret: pop rsi pop rbp pop rbx - jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp + jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp -sqrt_fixup_soft_aes_sandybridge: +sqrt_fixup_${ALGO}_soft_aes_sandybridge: dec rdx mov r15d, -1022 shl r15, 32 @@ -262,6 +262,6 @@ sqrt_fixup_soft_aes_sandybridge: sub rcx, r9 adc rdx, 0 movq xmm1, rdx - jmp 
sqrt_fixup_soft_aes_sandybridge_ret + jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret -cnv2_mainloop_soft_aes_sandybridge_asm_endp: +cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp: diff --git a/src/net/Client.cpp b/src/net/Client.cpp index d2ecf074..21e1c076 100644 --- a/src/net/Client.cpp +++ b/src/net/Client.cpp @@ -52,6 +52,7 @@ int64_t Client::m_sequence = 1; Client::Client(int id, const char *agent, IClientListener *listener) : m_quiet(false), m_nicehash(false), + m_donate(false), m_agent(agent), m_listener(listener), m_id(id), @@ -231,7 +232,7 @@ bool Client::parseJob(const rapidjson::Value &params, int *code) PowVariant powVariant = Options::i()->powVariant(); - if (!Options::i()->forcePowVariant()) { + if (!Options::i()->forcePowVariant() || m_donate) { if (params.HasMember("algo")) { std::string algo = params["algo"].GetString(); diff --git a/src/net/Client.h b/src/net/Client.h index 2cb5175b..a42fdb4e 100644 --- a/src/net/Client.h +++ b/src/net/Client.h @@ -67,6 +67,7 @@ public: inline int id() const { return m_id; } inline uint16_t port() const { return m_url.port(); } inline void setQuiet(bool quiet) { m_quiet = quiet; } + inline void setDonate(bool donate) { m_donate = donate; } inline void setRetryPause(int ms) { m_retryPause = ms; } static void onConnected(uv_async_t *handle); @@ -99,6 +100,7 @@ private: bool m_quiet; bool m_nicehash; + bool m_donate; char m_buf[2048]; char m_rpcId[64]; char m_sendBuf[768]; diff --git a/src/net/Job.cpp b/src/net/Job.cpp index 6ad4feb1..0e4febf9 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -150,9 +150,9 @@ PowVariant Job::powVariant() const } else { return PowVariant::POW_V0; } - } else if (m_powVariant == PowVariant::POW_XTL) { - if (m_blob[0] > 5) { - return PowVariant::POW_XTL_V9; + } else if (m_powVariant == PowVariant::POW_MSR) { + if (m_blob[0] > 8) { + return PowVariant::POW_FAST_2; } } diff --git a/src/net/Job.h b/src/net/Job.h index 4bd4642c..74fc5dc8 100644 --- a/src/net/Job.h +++ b/src/net/Job.h @@
-67,7 +67,7 @@ public: bool operator!=(const Job &other) const; private: - uint8_t m_blob[96]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk. + uint8_t m_blob[MAX_BLOB_SIZE]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk. bool m_nicehash; int m_poolId; diff --git a/src/net/strategies/DonateStrategy.cpp b/src/net/strategies/DonateStrategy.cpp index b3313a5a..b1319a1d 100644 --- a/src/net/strategies/DonateStrategy.cpp +++ b/src/net/strategies/DonateStrategy.cpp @@ -81,6 +81,7 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) : m_client->setUrl(url); m_client->setRetryPause(Options::i()->retryPause() * 1000); m_client->setQuiet(true); + m_client->setDonate(true); delete url; diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index 15389fbb..a2ffc84a 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -63,7 +63,7 @@ public: State(size_t hashMultiplier) { nonces = new uint32_t[hashMultiplier]; - blob = new uint8_t[84 * hashMultiplier]; + blob = new uint8_t[MAX_BLOB_SIZE * hashMultiplier]; for(size_t i=0; i