Refactored ASM code

- Removed duplicate code
- Autogenerated ASM files based on templates
- Cleanup in naming
This commit is contained in:
Ben Gräf 2019-01-22 20:29:13 +01:00
parent 6574a8e844
commit 2ec65c7a20
63 changed files with 712 additions and 6907 deletions

View file

@ -219,24 +219,7 @@ else()
endif(WITH_CC_SERVER OR WITH_CC_CLIENT) endif(WITH_CC_SERVER OR WITH_CC_CLIENT)
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
if (CMAKE_C_COMPILER_ID MATCHES MSVC) include(cmake/asm.cmake)
enable_language(ASM_MASM)
set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm")
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
else()
enable_language(ASM)
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S")
else()
set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
endif()
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
endif()
add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE})
set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)
else() else()
add_definitions(/DXMRIG_NO_ASM) add_definitions(/DXMRIG_NO_ASM)
endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)

115
cmake/asm.cmake Normal file
View file

@ -0,0 +1,115 @@
# Generates the per-algorithm main-loop assembly includes from the shared
# *.inc.in templates. Each algorithm variant sets ALGO, ITERATIONS and MASK
# and then runs every template of its CryptoNight generation through
# configure_file(), once for src/crypto/asm and once for src/crypto/asm/win.
# NOTE(review): the templates are presumably parameterized on ALGO, ITERATIONS
# and MASK; the template contents are not visible here -- confirm.

# Template base names (without the ".inc.in" suffix) of the CN v1 main loops.
set(XMRIG_ASM_CNV1_TEMPLATES
    cnv1_main_loop_sandybridge
    cnv1_main_loop_soft_aes_sandybridge
)

# Template base names (without the ".inc.in" suffix) of the CN v2 main loops.
set(XMRIG_ASM_CNV2_TEMPLATES
    cnv2_main_loop_ivybridge
    cnv2_main_loop_bulldozer
    cnv2_main_loop_ryzen
    cnv2_double_main_loop_sandybridge
    cnv2_main_loop_soft_aes_sandybridge
)

# xmrig_configure_asm(<tag> <template>...)
#
# Configures every <template>.inc.in found in src/crypto/asm and
# src/crypto/asm/win into <output>.inc, where <output> is the template name
# with <tag> inserted directly after "main_loop_". An empty <tag> keeps the
# template name unchanged.
#   Example: tag "fastv2_" turns cnv2_main_loop_ryzen
#            into cnv2_main_loop_fastv2_ryzen.
#
# ALGO, ITERATIONS and MASK are read by configure_file() from the calling
# scope, so they must be set before the call. Inputs are resolved against the
# current source directory and outputs against the current binary directory,
# which is what configure_file() does with relative paths.
function(xmrig_configure_asm tag)
    foreach(asm_dir IN ITEMS "src/crypto/asm" "src/crypto/asm/win")
        foreach(template IN LISTS ARGN)
            string(REPLACE "main_loop_" "main_loop_${tag}" output "${template}")
            configure_file(
                "${CMAKE_CURRENT_SOURCE_DIR}/${asm_dir}/${template}.inc.in"
                "${CMAKE_CURRENT_BINARY_DIR}/${asm_dir}/${output}.inc"
            )
        endforeach()
    endforeach()
endfunction()

# CN v1 original
set(ALGO "original")
set(ITERATIONS "524288") #0x80000
set(MASK "2097136") #0x1FFFF0
xmrig_configure_asm("" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 original
set(ALGO "originalv2")
set(ITERATIONS "524288") #0x80000
set(MASK "2097136") #0x1FFFF0
xmrig_configure_asm("" ${XMRIG_ASM_CNV2_TEMPLATES})

# CN v1 fast
set(ALGO "fast")
set(ITERATIONS "262144") #0x40000
set(MASK "2097136") #0x1FFFF0
xmrig_configure_asm("fast_" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 fast
set(ALGO "fastv2")
set(ITERATIONS "262144") #0x40000
set(MASK "2097136") #0x1FFFF0
xmrig_configure_asm("fastv2_" ${XMRIG_ASM_CNV2_TEMPLATES})

# CN lite (v1 templates)
set(ALGO "lite")
set(ITERATIONS "262144") #0x40000
set(MASK "1048560") #0xFFFF0
xmrig_configure_asm("lite_" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN upx (v1 templates)
set(ALGO "upx")
set(ITERATIONS "131072") #0x20000
set(MASK "1048560") #0xFFFF0
xmrig_configure_asm("upx_" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 ultralite
set(ALGO "ultralite")
set(ITERATIONS "65536") #0x10000
set(MASK "131056") #0x1FFF0
xmrig_configure_asm("ultralite_" ${XMRIG_ASM_CNV2_TEMPLATES})
# Builds the static library xmrig_asm from the single top-level assembly
# source that matches the toolchain:
#   - MSVC:               MASM source, assembled as ASM_MASM
#   - GNU on Windows:     dedicated Windows GCC wrapper (.S)
#   - everything else:    generic wrapper (.S)
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
    enable_language(ASM_MASM)
    set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm")
    # The property name must be LANGUAGE; a bare `PROPERTY ASM_MASM` only
    # sets an unrelated, empty property called "ASM_MASM".
    set_property(SOURCE "${XMRIG_ASM_FILE}" PROPERTY LANGUAGE ASM_MASM)
else()
    enable_language(ASM)

    if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
        set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S")
    else()
        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
    endif()

    # Pin the source to the ASM language enabled above. This is the language
    # CMake already selects for a .S file, so the build itself is unchanged.
    set_property(SOURCE "${XMRIG_ASM_FILE}" PROPERTY LANGUAGE ASM)
endif()

add_library(xmrig_asm STATIC "${XMRIG_ASM_FILE}")

# The library contains no C/C++ sources, so the linker language has to be
# named explicitly.
set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)

View file

@ -328,7 +328,7 @@ constexpr static const char *pow_variant_names[] = {
"xhv", "xhv",
"rto", "rto",
"xfh", "xfh",
"xtlv9", "fast2",
"upx", "upx",
"turtle" "turtle"
}; };
@ -420,10 +420,7 @@ Options::Options(int argc, char **argv) :
{ {
m_pools.push_back(new Url()); m_pools.push_back(new Url());
parseConfig(Platform::defaultConfigName());
int key; int key;
while (true) { while (true) {
key = getopt_long(argc, argv, short_options, options, nullptr); key = getopt_long(argc, argv, short_options, options, nullptr);
if (key < 0) { if (key < 0) {
@ -440,6 +437,10 @@ Options::Options(int argc, char **argv) :
return; return;
} }
if (!m_pools[0]->isValid() && (!m_ccHost || m_ccPort == 0)) {
parseConfig(Platform::defaultConfigName());
}
#ifdef XMRIG_CC_SERVER #ifdef XMRIG_CC_SERVER
if (m_ccPort == 0) { if (m_ccPort == 0) {
fprintf(stderr, "No CC Server Port supplied. Exiting.\n"); fprintf(stderr, "No CC Server Port supplied. Exiting.\n");
@ -1176,8 +1177,10 @@ bool Options::parsePowVariant(const char *powVariant)
break; break;
} }
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") || !strcmp(powVariant, "half"))) { if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") ||
m_powVariant = POW_XTL_V9; !strcmp(powVariant, "half") || !strcmp(powVariant, "msr2") ||
!strcmp(powVariant, "xtlv9"))) {
m_powVariant = POW_FAST_2;
break; break;
} }
@ -1186,7 +1189,7 @@ bool Options::parsePowVariant(const char *powVariant)
break; break;
} }
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "trtl")) { if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "trtl") || !strcmp(powVariant, "turtlev2") || !strcmp(powVariant, "pico"))) {
m_powVariant = POW_TURTLE; m_powVariant = POW_TURTLE;
break; break;
} }

View file

@ -29,6 +29,8 @@
#define MAX_NUM_HASH_BLOCKS 5 #define MAX_NUM_HASH_BLOCKS 5
#endif #endif
#define MAX_BLOB_SIZE 128
#include <cstdint> #include <cstdint>
#include <vector> #include <vector>

View file

@ -35,7 +35,7 @@ enum PowVariant
POW_XHV, POW_XHV,
POW_RTO, POW_RTO,
POW_XFH, POW_XFH,
POW_XTL_V9, POW_FAST_2,
POW_UPX, POW_UPX,
POW_TURTLE, POW_TURTLE,
LAST_ITEM LAST_ITEM
@ -65,8 +65,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
return "rto"; return "rto";
case POW_XFH: case POW_XFH:
return "xfh"; return "xfh";
case POW_XTL_V9: case POW_FAST_2:
return "xtlv9"; return "fast2";
case POW_UPX: case POW_UPX:
return "upx"; return "upx";
case POW_TURTLE: case POW_TURTLE:
@ -138,11 +138,11 @@ inline PowVariant parseVariant(const std::string variant)
powVariant = PowVariant::POW_RTO; powVariant = PowVariant::POW_RTO;
} else if (variant == "xfh" || variant == "freehaven" || variant == "faven") { } else if (variant == "xfh" || variant == "freehaven" || variant == "faven") {
powVariant = PowVariant::POW_XFH; powVariant = PowVariant::POW_XFH;
} else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2" || variant == "half") { } else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2" || variant == "half" || variant == "msr2" || variant == "fast2") {
powVariant = PowVariant::POW_XTL_V9; powVariant = PowVariant::POW_FAST_2;
} else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") { } else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") {
powVariant = PowVariant::POW_UPX; powVariant = PowVariant::POW_UPX;
} else if (variant == "turtle" || variant == "trtl") { } else if (variant == "turtle" || variant == "trtl" || variant == "pico" || variant == "turtlev2") {
powVariant = PowVariant::POW_TURTLE; powVariant = PowVariant::POW_TURTLE;
} }

View file

@ -70,7 +70,7 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
} }
#endif #endif
} else if (powVersion == PowVariant::POW_XTL_V9) { } else if (powVersion == PowVariant::POW_FAST_2) {
#if defined(XMRIG_ARM) #if defined(XMRIG_ARM)
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
#else #else
@ -124,7 +124,7 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
} }
#endif #endif
} else if (powVersion == PowVariant::POW_XTL_V9) { } else if (powVersion == PowVariant::POW_FAST_2) {
#if defined(XMRIG_ARM) #if defined(XMRIG_ARM)
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
#else #else
@ -237,6 +237,7 @@ static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowV
template <size_t NUM_HASH_BLOCKS> template <size_t NUM_HASH_BLOCKS>
static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
# if !defined(XMRIG_ARMv7)
#if defined(XMRIG_ARM) #if defined(XMRIG_ARM)
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
#else #else
@ -248,6 +249,7 @@ static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVar
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
} }
#endif #endif
# endif
} }
template <size_t NUM_HASH_BLOCKS> template <size_t NUM_HASH_BLOCKS>
@ -642,7 +644,7 @@ bool CryptoNight::selfTest(int algo)
// cnv8 + xtl aka cn-fast2 // cnv8 + xtl aka cn-fast2
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL_V9, test_input, 76, output, scratchPads); cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads);
result = result && memcmp(output, test_output_xtl_v9, 32) == 0; result = result && memcmp(output, test_output_xtl_v9, 32) == 0;
} }

View file

@ -1434,70 +1434,68 @@ public:
uint8_t* __restrict__ output, uint8_t* __restrict__ output,
ScratchPad** __restrict__ scratchPad) ScratchPad** __restrict__ scratchPad)
{ {
const uint8_t* l; keccak(input, (int) size, scratchPad[0]->state, 200);
uint64_t* h;
uint64_t al;
uint64_t ah;
uint64_t idx;
__m128i bx0;
__m128i bx1;
keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200); const uint8_t* l0 = scratchPad[0]->memory;
uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
l = scratchPad[0]->memory; cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l); uint64_t al0 = h0[0] ^h0[4];
uint64_t ah0 = h0[1] ^h0[5];
al = h[0] ^ h[4]; __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
ah = h[1] ^ h[5]; __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]);
idx = h[0] ^ h[4];
uint64_t division_result_xmm0 = h[12]; uint64_t idx0 = h0[0] ^h0[4];
uint64_t sqrt_result0 = h[13]; uint64_t division_result_xmm0 = h0[12];
uint64_t sqrt_result0 = h0[13];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
const __m128i ax = _mm_set_epi64x(ah, al); __m128i cx0;
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
__m128i cx;
if (SOFT_AES) { if (SOFT_AES) {
cx = soft_aesenc((uint32_t*) &l[idx & MASK], ax); cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0);
} else { } else {
cx = _mm_load_si128((__m128i*) &l[idx & MASK]); cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx = _mm_aesenc_si128(cx, ax); cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
} }
SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0)
_mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
idx0 = EXTRACT64(cx0);
uint64_t hi, lo, cl, ch; uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l[idx & MASK])[0]; cl = ((uint64_t*) &l0[idx0 & MASK])[0];
ch = ((uint64_t*) &l[idx & MASK])[1]; ch = ((uint64_t*) &l0[idx0 & MASK])[1];
INTEGER_MATH_V2(0, cl, cx) INTEGER_MATH_V2(0, cl, cx0);
lo = __umul128(idx, cl, &hi); lo = __umul128(idx0, cl, &hi);
SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi);
al += hi; al0 += hi;
ah += lo; ah0 += lo;
((uint64_t*) &l[idx & MASK])[0] = al; ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
((uint64_t*) &l[idx & MASK])[1] = ah; ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
ah ^= ch; ah0 ^= ch;
al ^= cl; al0 ^= cl;
idx = al; idx0 = al0;
bx0 = cx; bx10 = bx00;
bx00 = cx0;
} }
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h); cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
keccakf(h, 24);
keccakf(h0, 24);
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
} }
@ -2037,9 +2035,9 @@ public:
uint64_t ah1 = h1[1] ^h1[5]; uint64_t ah1 = h1[1] ^h1[5];
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
__m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
__m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
uint64_t idx0 = h0[0] ^h0[4]; uint64_t idx0 = h0[0] ^h0[4];
@ -2052,11 +2050,12 @@ public:
uint64_t sqrt_result1 = h1[13]; uint64_t sqrt_result1 = h1[13];
for (size_t i = 0; i < ITERATIONS; i++) { for (size_t i = 0; i < ITERATIONS; i++) {
__m128i cx0;
__m128i cx1;
const __m128i ax0 = _mm_set_epi64x(ah0, al0); const __m128i ax0 = _mm_set_epi64x(ah0, al0);
const __m128i ax1 = _mm_set_epi64x(ah1, al1); const __m128i ax1 = _mm_set_epi64x(ah1, al1);
__m128i cx0;
__m128i cx1;
if (SOFT_AES) { if (SOFT_AES) {
cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0);
cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1);
@ -2064,8 +2063,8 @@ public:
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
cx0 = _mm_aesenc_si128(cx0, ax0); cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
cx1 = _mm_aesenc_si128(cx1, ax1); cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
} }
SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0)
@ -2077,9 +2076,6 @@ public:
idx0 = EXTRACT64(cx0); idx0 = EXTRACT64(cx0);
idx1 = EXTRACT64(cx1); idx1 = EXTRACT64(cx1);
_mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
_mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));
uint64_t hi, lo, cl, ch; uint64_t hi, lo, cl, ch;
cl = ((uint64_t*) &l0[idx0 & MASK])[0]; cl = ((uint64_t*) &l0[idx0 & MASK])[0];
ch = ((uint64_t*) &l0[idx0 & MASK])[1]; ch = ((uint64_t*) &l0[idx0 & MASK])[1];
@ -2103,6 +2099,7 @@ public:
bx10 = bx00; bx10 = bx00;
bx00 = cx0; bx00 = cx0;
cl = ((uint64_t*) &l1[idx1 & MASK])[0]; cl = ((uint64_t*) &l1[idx1 & MASK])[0];
ch = ((uint64_t*) &l1[idx1 & MASK])[1]; ch = ((uint64_t*) &l1[idx1 & MASK])[1];

View file

@ -50,29 +50,34 @@ extern "C"
#include "crypto/c_skein.h" #include "crypto/c_skein.h"
#ifndef XMRIG_NO_ASM #ifndef XMRIG_NO_ASM
void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_sandybridge_asm(ScratchPad* ctx0);
void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_lite_sandybridge_asm(ScratchPad* ctx0);
void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_fast_sandybridge_asm(ScratchPad* ctx0);
void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_upx_sandybridge_asm(ScratchPad* ctx0);
void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0); void cnv2_main_loop_ivybridge_asm(ScratchPad* ctx0);
void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cnv2_main_loop_ryzen_asm(ScratchPad* ctx0);
void cn_fastv2_mainloop_ivybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_bulldozer_asm(ScratchPad* ctx0);
void cn_fastv2_mainloop_ryzen_asm(ScratchPad* ctx0); void cnv2_double_main_loop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cnv2_main_loop_fastv2_ivybridge_asm(ScratchPad* ctx0);
void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_fastv2_ryzen_asm(ScratchPad* ctx0);
void cn_ultralitev2_mainloop_ivybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_fastv2_bulldozer_asm(ScratchPad* ctx0);
void cn_ultralitev2_mainloop_ryzen_asm(ScratchPad* ctx0); void cnv2_double_main_loop_fastv2_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
void cn_ultralitev2_mainloop_bulldozer_asm(ScratchPad* ctx0);
void cn_ultralitev2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); void cnv2_main_loop_ultralite_ivybridge_asm(ScratchPad* ctx0);
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_ultralite_ryzen_asm(ScratchPad* ctx0);
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_main_loop_ultralite_bulldozer_asm(ScratchPad* ctx0);
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv2_double_main_loop_ultralite_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); void cnv1_main_loop_lite_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx); void cnv1_main_loop_fast_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cnv1_main_loop_upx_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cnv2_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(ScratchPad* ctx0);
void cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(ScratchPad* ctx);
#endif #endif
} }
@ -1437,28 +1442,28 @@ public:
if (SOFT_AES) { if (SOFT_AES) {
scratchPad[0]->t_fn = (const uint32_t*)saes_table; scratchPad[0]->t_fn = (const uint32_t*)saes_table;
if (ITERATIONS == 0x80000) { if (ITERATIONS == 0x40000) {
cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000) {
if (MASK == 0x1FFFF0) { if (MASK == 0x1FFFF0) {
cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv1_main_loop_fast_soft_aes_sandybridge_asm(scratchPad[0]);
} else { } else {
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv1_main_loop_lite_soft_aes_sandybridge_asm(scratchPad[0]);
}
} else if (ITERATIONS == 0x20000) {
cnv1_main_loop_upx_soft_aes_sandybridge_asm(scratchPad[0]);
} else {
cnv1_main_loop_soft_aes_sandybridge_asm(scratchPad[0]);
} }
} else { } else {
cn_liteupx_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); if (ITERATIONS == 0x40000) {
}
} else {
if (ITERATIONS == 0x80000) {
cnv1_mainloop_sandybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000) {
if (MASK == 0x1FFFF0) { if (MASK == 0x1FFFF0) {
cn_fast_mainloop_sandybridge_asm(scratchPad[0]); cnv1_main_loop_fast_sandybridge_asm(scratchPad[0]);
} else { } else {
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]); cnv1_main_loop_lite_sandybridge_asm(scratchPad[0]);
} }
} else if (ITERATIONS == 0x20000) {
cnv1_main_loop_upx_sandybridge_asm(scratchPad[0]);
} else { } else {
cn_liteupx_mainloop_sandybridge_asm(scratchPad[0]); cnv1_main_loop_sandybridge_asm(scratchPad[0]);
} }
} }
#endif #endif
@ -1560,36 +1565,36 @@ public:
scratchPad[0]->input = input; scratchPad[0]->input = input;
scratchPad[0]->t_fn = (const uint32_t*)saes_table; scratchPad[0]->t_fn = (const uint32_t*)saes_table;
if (ITERATIONS == 0x40000) { if (ITERATIONS == 0x40000) {
cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x10000) { } else if (ITERATIONS == 0x10000) {
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(scratchPad[0]);
} else { } else {
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); cnv2_main_loop_soft_aes_sandybridge_asm(scratchPad[0]);
} }
} else { } else {
if (ITERATIONS == 0x10000) { if (ITERATIONS == 0x40000) {
cn_ultralitev2_mainloop_ivybridge_asm(scratchPad[0]); cnv2_main_loop_fastv2_ivybridge_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000) { } else if (ITERATIONS == 0x10000) {
cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]); cnv2_main_loop_ultralite_ivybridge_asm(scratchPad[0]);
} else { } else {
cnv2_mainloop_ivybridge_asm(scratchPad[0]); cnv2_main_loop_ivybridge_asm(scratchPad[0]);
} }
} }
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) { } else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
if (ITERATIONS == 0x10000) { if (ITERATIONS == 0x40000) {
cn_ultralitev2_mainloop_ryzen_asm(scratchPad[0]); cnv2_main_loop_fastv2_ryzen_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000) { } else if (ITERATIONS == 0x10000) {
cn_fastv2_mainloop_ryzen_asm(scratchPad[0]); cnv2_main_loop_ultralite_ryzen_asm(scratchPad[0]);
} else { } else {
cnv2_mainloop_ryzen_asm(scratchPad[0]); cnv2_main_loop_ryzen_asm(scratchPad[0]);
} }
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) { } else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
if (ITERATIONS == 0x10000) { if (ITERATIONS == 0x40000) {
cn_ultralitev2_mainloop_bulldozer_asm(scratchPad[0]); cnv2_main_loop_fastv2_bulldozer_asm(scratchPad[0]);
} else if (ITERATIONS == 0x40000) { } else if (ITERATIONS == 0x10000) {
cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]); cnv2_main_loop_ultralite_bulldozer_asm(scratchPad[0]);
} else { } else {
cnv2_mainloop_bulldozer_asm(scratchPad[0]); cnv2_main_loop_bulldozer_asm(scratchPad[0]);
} }
} }
#endif #endif
@ -2306,12 +2311,12 @@ public:
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1); cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
#ifndef XMRIG_NO_ASM #ifndef XMRIG_NO_ASM
if (ITERATIONS == 0x10000) { if (ITERATIONS == 0x40000) {
cn_ultralitev2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); cnv2_double_main_loop_fastv2_sandybridge_asm(scratchPad[0], scratchPad[1]);
} else if (ITERATIONS == 0x40000) { } else if (ITERATIONS == 0x10000) {
cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); cnv2_double_main_loop_ultralite_sandybridge_asm(scratchPad[0], scratchPad[1]);
} else { } else {
cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); cnv2_double_main_loop_sandybridge_asm(scratchPad[0], scratchPad[1]);
} }
#endif #endif

View file

@ -1,166 +0,0 @@
/*
 * Main loop body of the "fast" CryptoNight v1 variant using table-based
 * (software) AES: 262144 (0x40000) iterations with index mask 2097136
 * (0x1FFFF0).
 *
 * On entry rcx points at the context structure; every [rcx+N] load below
 * reads a field of it. This fragment contains no ret instruction, so the
 * return is presumably supplied by the file that includes it -- confirm.
 */

/* Prologue: save general-purpose registers, then reserve 72 bytes:
   [rsp..rsp+63] holds xmm6-xmm9, [rsp+64] is a spill slot for the masked
   memory index. */
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
/* Build the initial loop state by XOR-ing pairs of 64-bit words of the
   context: r8/r13 form one 128-bit value, xmm4 the other. */
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
/* rdi = base pointer used for all [index+rdi] memory accesses
   (presumably the scratchpad memory -- confirm). */
mov rdi, QWORD PTR [rcx+224]
/* First memory index: low bits of r8 masked with 0x1FFFF0. */
and edx, 2097136
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
/* rcx is reused as a scratch register inside the loop, so the context
   pointer is parked in xmm6. */
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
/* Loop counter: 262144 = 0x40000 iterations. */
mov eax, 262144
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_fast_mainloop_soft_aes_sandybridge:
/* Park the loop counter in xmm9; eax is needed as scratch below. */
movq xmm9, rax
/* r12 = lookup-table base from [ctx+272] (presumably the soft-AES
   table -- confirm). */
mov r12, QWORD PTR [rcx+272]
/* Load the four 32-bit words of the 16-byte block at the current index. */
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
/* Byte-wise table lookups: each word is consumed one byte at a time and
   the dword table entries (at offsets 0, 1024, 2048 and 2048+256*4 from
   the table base) are XOR-ed together. */
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
/* Restore the memory base pointer (edi was used as scratch above). */
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
movd xmm3, eax
movq rax, xmm7
/* Reassemble the four result words into xmm3 and XOR in the 128-bit
   value r13:r8. */
punpckldq xmm3, xmm0
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
/* Next memory index: low 64 bits of the result, masked with 0x1FFFF0. */
movq r9, xmm3
mov r10d, r9d
and r10d, 2097136
/* Write (result XOR xmm4) back to the current block ... */
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
/* ... then replace byte 11 of the stored block with the byte looked up,
   using its old value as index, in the table at [ctx+264]. */
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
/* Load the 16-byte block at the next index: rbx = low qword,
   r11 = high qword. */
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
movdqa xmm4, xmm3
/* 64x64 -> 128-bit multiply: rdx:rax = rbx * (low qword of the result). */
mul rcx
/* Restore the context pointer for the next iteration. */
movq rcx, xmm6
/* Add the product halves to r8 (high half) and r13 (low half). */
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
/* Store the updated pair back to the block, then XOR the running state
   with the block's previous contents. */
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
/* Reload the loop counter parked in xmm9. */
movq rax, xmm9
mov rdx, r8
xor r13, r11
/* Mask the index for the next iteration and spill it to [rsp+64]. */
and edx, 2097136
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_fast_mainloop_soft_aes_sandybridge
/* Epilogue: restore xmm6-xmm9 and the saved general-purpose registers. */
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,180 +0,0 @@
/*
 * Main-loop fragment labelled cnv2_main_loop_fast2_bulldozer.
 *
 * Input: rcx points at a structure whose fields are read at offsets 0..104
 * and 224 (presumably the hash context - confirm against the C caller).
 * The loop runs 262144 times (counter in ebp) and masks every memory index
 * with 2097136 (0x1FFFF0).
 *
 * The fragment contains no ret: after the epilogue it jumps to the _endp
 * label on its last line, so whatever includes it must supply the function
 * label and the return.
 */
/* Prologue: rbx/rbp/rsi go into the slots above the return address, the other registers restored in the epilogue are pushed. */
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
/* Keep the incoming MXCSR at [rsp]; load 24448 (0x5F80): exception masks set, rounding control 10b (round toward +infinity). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :r11 = [32]^[0]  : [40]^[8]    (packed into xmm6 at the top of each iteration)
 *   xmm3    = [48]^[16] : [56]^[24]
 *   xmm4    = [80]^[64] : [88]^[72]
 * rbx = base pointer loaded from [rcx+224]; r10 = r8 masked = first index.
 * rdi = [ctx+104], r15 = [ctx+96] seed the division / square-root chain.
 */
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 262144
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
/* xmm8 stays zero for the whole loop; it is copied into xmm1 before each sqrtsd. */
xorps xmm8, xmm8
/* xmm7 = 1023 << 52 (bit pattern of the double 1.0). NOTE(review): xmm7 is not read again in this variant; the loop rebuilds the constant in rdi instead. */
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_fast2_bulldozer:
/* Load the 16-byte line at rbx+r10 and build the round key xmm6 = r8:r11. */
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
/* r9 = 2*rdi is the start of the divisor; rdi << 32 is XORed into rsi further down. */
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
/* Offsets of the three neighbouring 16-byte lines: index ^ 16, ^ 32, ^ 48. */
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
/* Add xmm3 / xmm6 / xmm4 to the three neighbours and store them back rotated by one position. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
/* rdi = 1023 << 52, added to the sqrt operand below. */
mov edi, 1023
shl rdi, 52
/* r14 = low qword of the AES result, rax = high qword (the dividend). */
movq r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
/* Second index comes from the low qword of the AES result. */
mov r10, r14
and r10d, 2097136
/* Write (AES result ^ xmm3) back to the first line. */
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
/* Divisor r9 = (2*old rdi + r14d) | 0x80000001; dividend is rdx:rax with rdx = 0. */
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
/* r15 = low 32 bits of the quotient | (remainder << 32). */
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
/* sqrt operand = ((r14 + r15) >> 12) + (1023 << 52), taken as a double bit pattern. */
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
movq rdi, xmm1
/* If the low 19 bits of the result are all zero take the fix-up path; otherwise rdi = result >> 19. */
test rdi, 524287
je sqrt_fixup_fast2_bulldozer
shr rdi, 19
sqrt_fixup_fast2_bulldozer_ret:
/* 64x64 -> 128 bit multiply: rdx:rax = rsi * r14; xmm0 = (high, low). */
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
/* Second neighbour shuffle, around the new index in r10. */
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
/* The product halves are XORed with the ^32 line before being accumulated. */
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
/* Rotate state: xmm4 <- xmm3 here, xmm3 <- xmm5 (AES result) just before the branch. */
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
/* Store the updated r8:r11 at the second line, then XOR in its previous contents (rsi, r13). */
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
/* Next iteration index. */
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_fast2_bulldozer
/* Epilogue: restore MXCSR, xmm6-xmm8 and the saved general-purpose registers. */
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
/* Skip over the out-of-line fix-up code. */
jmp cnv2_main_loop_fast2_bulldozer_endp
/*
 * Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
 * r9 = low qword of xmm5 + r15 (the sqrt input before the shift); rdi is
 * decremented and shifted, then conditionally incremented through the
 * carry of the final sub (adc rdi, 0).
 */
sqrt_fixup_fast2_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
/* rdx = -1022 << 32 (mov to edx zero-extends, so this is 0xFFFFFC02 << 32). */
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_fast2_bulldozer_ret
cnv2_main_loop_fast2_bulldozer_endp:

View file

@ -1,183 +0,0 @@
/*
 * Main-loop fragment labelled $main_loop_fast2_ryzen.
 *
 * Input: rcx points at a structure whose fields are read at offsets 0..104
 * and 224 (presumably the hash context - confirm against the C caller).
 * The loop runs 262144 times (counter in ebp) and masks every memory index
 * with 2097136 (0x1FFFF0).
 *
 * Compared with a GPR-based formulation, this variant keeps the division
 * result and the sqrt operand in XMM registers (punpckldq / paddq / psrlq).
 * The fragment contains no ret; it ends by jumping to its _endp label.
 */
/* Prologue: rbx/rbp/rsi go into the slots above the return address, the other registers restored in the epilogue are pushed. */
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
/* Keep the incoming MXCSR at [rsp]; load 24448 (0x5F80): exception masks set, rounding control 10b (round toward +infinity). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :r11 = [32]^[0]  : [40]^[8]    (packed into xmm6 at the top of each iteration)
 *   xmm3    = [48]^[16] : [56]^[24]
 *   xmm4    = [80]^[64] : [88]^[72]
 * rbx = base pointer loaded from [rcx+224]; r10 = r8 masked = first index.
 * rdi = [ctx+104], r15 = [ctx+96] seed the division / square-root chain.
 */
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
mov ebp, 262144
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
/* xmm8 stays zero for the whole loop; it is copied into xmm1 each iteration. */
xorps xmm8, xmm8
/* xmm7 = 1023 << 52 (bit pattern of the double 1.0), added to the sqrt operand inside the loop. */
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
$main_loop_fast2_ryzen:
/* Load the 16-byte line at rbx+r10 and build the round key xmm6 = r8:r11. */
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
/* r9 = 2*rdi is the start of the divisor; rdi << 32 is XORed into rsi further down. */
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
/* Offsets of the three neighbouring 16-byte lines: index ^ 16, ^ 32, ^ 48. */
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
/* Add xmm3 / xmm6 / xmm4 to the three neighbours and store them back rotated by one position. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
/* r14 = low qword of the AES result; it selects the second index. */
movq r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
/* Write (AES result ^ xmm3) back to the first line. */
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
/* Divisor r9 = (2*old rdi + r14d) | 0x80000001; dividend = high qword of the AES result, rdx = 0. */
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
/* Interleave the low dwords: low qword of xmm0 = quotient[31:0] | remainder[31:0] << 32; copy it to r15. */
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
/* xmm2 keeps (division result + AES result) for the fix-up path; xmm0 = (sum >> 12) + xmm7 is the sqrt operand. */
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
/* If the low 19 bits of the result are all zero take the fix-up path; otherwise rdi = result >> 19. */
test rdi, 524287
je $sqrt_fixup_fast2_ryzen
shr rdi, 19
$sqrt_fixup_fast2_ryzen_ret:
/* 64x64 -> 128 bit multiply: rdx:rax = rsi * r14; xmm0 = (high, low). */
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
/* Second neighbour shuffle, around the new index in r10. */
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
/* The product halves are XORed with the ^32 line before being accumulated. */
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
/* Rotate state: xmm4 <- xmm3 here, xmm3 <- xmm5 (AES result) just before the branch. */
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
/* Store the updated r8:r11 at the second line, then XOR in its previous contents (rsi, r13). */
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
/* Next iteration index. */
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne $main_loop_fast2_ryzen
/* Epilogue: restore MXCSR, xmm6-xmm8 and the saved general-purpose registers. */
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
/* Skip over the out-of-line fix-up code. */
jmp $cnv2_main_loop_fast2_ryzen_endp
/*
 * Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
 * r9 = the pre-shift sqrt input saved in xmm2; rdi is decremented and
 * shifted, then conditionally incremented through the carry of the final
 * sub (adc rdi, 0).
 */
$sqrt_fixup_fast2_ryzen:
movq r9, xmm2
dec rdi
/* rdx = -1022 << 32 (mov to edx zero-extends, so this is 0xFFFFFC02 << 32). */
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp $sqrt_fixup_fast2_ryzen_ret
$cnv2_main_loop_fast2_ryzen_endp:

View file

@ -1,271 +0,0 @@
/*
 * Main-loop fragment labelled cnv2_mainloop_soft_aes_fast2_sandybridge.
 *
 * Same overall loop shape as the aesenc-based fast2 loops, but the AES round
 * is computed with dword table look-ups instead of the aesenc instruction.
 * The table pointer is read from [ctx+272]; the addressing below implies
 * four consecutive tables of 256 dwords (1024 bytes) each - presumably AES
 * T-tables, confirm against the code that fills them.
 *
 * Input: rcx points at the structure read at offsets 0..104, 224 and 272.
 * The loop runs 262144 times and masks indices with 2097136 (0x1FFFF0).
 * Three slots above the return address serve as scratch:
 *   [rsp+224] = saved rcx, [rsp+240] = second half of the r8:r9 state,
 *   [rsp+248] = current index (offsets as seen after the prologue).
 * The fragment contains no ret; it ends by jumping to its _asm_endp label.
 */
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 152
/* Keep the incoming MXCSR at [rsp+4]; load 24448 (0x5F80): exception masks set, rounding control 10b (round toward +infinity). */
stmxcsr DWORD PTR [rsp+4]
mov DWORD PTR [rsp], 24448
ldmxcsr DWORD PTR [rsp]
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :r9 = [32]^[0]  : [40]^[8]
 *   xmm4   = [48]^[16] : [56]^[24]
 *   xmm5   = [80]^[64] : [88]^[72]
 * r11 = base pointer loaded from [rcx+224] (also parked in xmm12).
 */
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
/* Save xmm6-xmm13; all of them are overwritten below and restored in the epilogue. */
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
/* xmm8 = 1023 << 52 (bit pattern of the double 1.0), added to the sqrt operand inside the loop. */
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov rax, r8
punpcklqdq xmm4, xmm0
/* First index = r8 masked. */
and eax, 2097136
/* xmm10 = [ctx+96] and xmm13 = [ctx+104] seed the division / square-root chain; xmm9 stays zero. */
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+248], rax
movq xmm12, r11
mov QWORD PTR [rsp+240], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
/* r12d = iteration counter; it is parked in xmm11 while r12 is used as the table pointer. */
mov r12d, 262144
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cnv2_mainloop_soft_aes_fast2_sandybridge:
movd xmm11, r12d
/* r12 = table pointer from [ctx+272]; r13 = address of the current 16-byte line. */
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
/* Fetch the four dwords of the line (esi, r10d, r14d, ebp) and build the round key xmm7 = r8:r9. */
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+248]
/* Byte 0 of every dword indexes the table at r12+0. */
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
/* Byte 1 of every dword indexes the table at r12+1024; each source dword feeds a different output column. */
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
/* Advance to the third table; byte 2 indexes it directly and byte 3 (index + 256) reaches the fourth. */
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
/* r11 was used as a temporary above; reload the base pointer from xmm12. */
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
/* rcx / rax / rdx = index ^ 16 / ^ 32 / ^ 48 (neighbouring lines). */
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
/* Assemble the four output dwords into xmm6 and XOR with the round key xmm7. */
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
/* Add xmm4 / xmm7 / xmm5 to the three neighbours and store them back rotated by one position. */
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
/* rdi = low qword of the round result; it selects the second index (r10). */
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
xor edx, edx
/* rbx = xmm10 ^ (xmm13 << 32), later XORed with the first qword of the second line. */
mov rax, rcx
shl rax, 32
movq rbx, xmm10
xor rbx, rax
/* Divisor r9 = (2*xmm13 + edi) | 0x80000001. */
lea r9, QWORD PTR [rcx+rcx]
add r9d, edi
movdqa xmm0, xmm6
pxor xmm0, xmm4
mov ecx, -2147483647
/* Write (round result ^ xmm4) back to the first line. */
movdqu XMMWORD PTR [r13], xmm0
or r9, rcx
movdqa xmm0, xmm6
movaps xmm1, xmm9
/* Dividend = high qword of the round result (rdx is zero). */
psrldq xmm0, 8
movq rax, xmm0
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
div r9
/* rdx = (remainder << 32) + low 32 bits of the quotient; kept in xmm10 for the next iteration. */
shl rdx, 32
mov eax, eax
add rdx, rax
lea r9, QWORD PTR [rdx+rdi]
movq xmm10, rdx
/* sqrt operand = ((division result + rdi) >> 12) + xmm8; r9 keeps the unshifted sum for the fix-up path. */
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm1, xmm0
movq rdx, xmm1
/* If the low 19 bits of the result are all zero take the fix-up path; otherwise xmm1 = result >> 19. */
test rdx, 524287
je sqrt_fixup_soft_aes_fast2_sandybridge
psrlq xmm1, 19
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
mov r9, r10
/* xmm13 = new square-root value for the next iteration. */
movdqa xmm13, xmm1
/* r9 / rcx / r10 = second index ^ 16 / ^ 32 / ^ 48. */
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
/* 64x64 -> 128 bit multiply: rdx:rax = rbx * rdi; xmm3 = (high, low). */
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
/* The product halves are XORed with the ^32 line before being accumulated. */
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
/* Rotate state: xmm5 <- xmm4, xmm4 <- xmm6 (round result). */
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+240]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
/* Reload the pointer saved from rcx at entry and the loop counter parked in xmm11. */
mov r10, QWORD PTR [rsp+224]
movd r12d, xmm11
/* Store the updated r8:r9 at the second line, then XOR in its previous contents (rbx, rbp). */
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
/* Next iteration index. */
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax
sub r12d, 1
jne cnv2_mainloop_soft_aes_fast2_sandybridge
/* Epilogue: restore MXCSR, xmm6-xmm13 and the pushed general-purpose registers. */
ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 152
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
/* Skip over the out-of-line fix-up code. */
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp
/*
 * Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
 * r9 holds the pre-shift sqrt input; rdx is decremented and shifted, then
 * conditionally incremented through the carry of the final sub (adc rdx, 0)
 * and handed back in xmm1.
 */
sqrt_fixup_soft_aes_fast2_sandybridge:
dec rdx
/* r15 = -1022 << 32 (mov to r15d zero-extends, so this is 0xFFFFFC02 << 32). */
mov r15d, -1022
shl r15, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
lea rcx, [rcx+r15+1]
add rax, r15
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm1, rdx
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:

View file

@ -1,74 +0,0 @@
/*
 * Main-loop fragment labelled cn_liteupx_mainloop_sandybridge.
 *
 * Input: rcx points at a structure read at offsets 0..56, 192, 224, 256 and
 * 264 (presumably the hash context - confirm against the C caller).
 * The loop runs 131072 times (counter in ebp) and masks every memory index
 * with 1048560 (0xFFFF0).
 * The fragment has no ret and no trailing label: execution falls through to
 * whatever follows the include.
 */
/* Prologue: rbx/rbp/rsi/rdi go into the four slots above the return address; r14/r15 are pushed. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :rdi = [32]^[0]  : [40]^[8]
 *   xmm3    = [48]^[16] : [56]^[24]
 */
mov rax, QWORD PTR [rcx+48]
mov ebp, 131072
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
/* r15 = pointer to a byte look-up table, taken from [ctx+264]. */
mov r15, QWORD PTR [rcx+264]
and edx, 1048560
/* r14 = 8 bytes at offset 35 of the buffer pointed to by [ctx+256], XORed with [ctx+192]; XORed into every second-line store below. */
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
/* rsi = base pointer loaded from [ctx+224]; xmm2 = first 16-byte line. */
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
movdqu xmm2, XMMWORD PTR [rdx+rsi]
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_liteupx_mainloop_sandybridge:
/* One AES round on the current line with round key xmm1 = r8:rdi. */
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
/* r10 = low qword of the result; r9 = address of the second line. */
movq r10, xmm2
mov r9d, r10d
and r9d, 1048560
add r9, rsi
/* Write (result ^ xmm3) back to the first line; xmm3 <- result for the next iteration. */
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
/* Byte 11 of the value just stored is replaced by table[r15 + that byte]. */
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
/* 64x64 -> 128 bit multiply of the second line's first qword by r10: rdx:rax. */
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
mul r10
/* r8 += high half, rdi += low half; store r8 and (rdi ^ r14) at the second line. */
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
/* XOR in the previous contents of the second line and derive the next index. */
xor r8, rbx
mov rdx, r8
and edx, 1048560
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cn_liteupx_mainloop_sandybridge
/* Epilogue: the saved slots are now 16 bytes further up because of the two pushes. */
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -1,166 +0,0 @@
/*
 * Main-loop fragment labelled cn_liteupx_mainloop_soft_aes_sandybridge.
 *
 * Table-driven counterpart of the aesenc-based loop: the AES round is
 * computed with dword look-ups through the pointer at [ctx+272]. The
 * addressing implies four consecutive tables of 256 dwords (1024 bytes)
 * each - presumably AES T-tables, confirm against the code that fills them.
 *
 * Input: rcx points at a structure read at offsets 0..56, 192, 224, 256,
 * 264 and 272. The loop runs 131072 times and masks indices with 1048560
 * (0xFFFF0). XMM registers double as spill slots:
 *   xmm5 = value XORed into every second-line store, xmm6 = ctx pointer,
 *   xmm7 = byte-table pointer, xmm8 = base pointer, xmm9 = loop counter.
 * The fragment has no ret and no trailing label: execution falls through to
 * whatever follows the include.
 */
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 72
/* Save xmm6-xmm9; [rsp+64] is the scratch slot for the current index. */
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :r13 = [32]^[0]  : [40]^[8]
 *   xmm4    = [48]^[16] : [56]^[24]
 */
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
mov rdi, QWORD PTR [rcx+224]
and edx, 1048560
/* xmm5 = 8 bytes at offset 35 of the buffer pointed to by [ctx+256], XORed with [ctx+192]. */
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
/* eax = iteration counter; parked in xmm9 at the top of every iteration. */
mov eax, 131072
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_liteupx_mainloop_soft_aes_sandybridge:
movq xmm9, rax
/* r12 = table pointer; fetch the four dwords of the current line (esi, r10d, r14d, ebp). */
mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
/* Byte 0 of every dword indexes the table at r12+0. */
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
/* Byte 1 of every dword indexes the table at r12+1024; each source dword feeds a different output column. */
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
/* Advance to the third table; byte 2 indexes it directly and byte 3 (index + 256) reaches the fourth. */
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
/* edi was used as a temporary above; reload the base pointer from xmm8. */
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
/* Assemble the four output dwords into xmm3 and XOR with the round key r8:r13. */
movd xmm3, eax
movq rax, xmm7
punpckldq xmm3, xmm0
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
/* r9 = low qword of the round result; r10 = second index. */
movq r9, xmm3
mov r10d, r9d
and r10d, 1048560
/* Write (round result ^ xmm4) back to the first line. */
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
/* Byte 11 of the value just stored is replaced by table[rax + that byte] (rax = pointer from xmm7). */
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
/* 64x64 -> 128 bit multiply of the second line's first qword by the round result's low qword: rdx:rax. */
mov rax, rbx
/* xmm4 <- round result for the next iteration. */
movdqa xmm4, xmm3
mul rcx
/* Restore the ctx pointer into rcx for the next iteration's table-pointer load. */
movq rcx, xmm6
/* r8 += high half, r13 += low half; store r8 and (r13 ^ xmm5) at the second line. */
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
/* Reload the loop counter, derive the next index and spill it to [rsp+64]. */
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 1048560
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_liteupx_mainloop_soft_aes_sandybridge
/* Epilogue: restore xmm6-xmm9 and the pushed general-purpose registers. */
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,74 +0,0 @@
/*
 * Main-loop fragment labelled cn_litev1_mainloop_sandybridge.
 *
 * Input: rcx points at a structure read at offsets 0..56, 192, 224, 256 and
 * 264 (presumably the hash context - confirm against the C caller).
 * The loop runs 262144 times (counter in ebp) and masks every memory index
 * with 1048560 (0xFFFF0).
 * The fragment has no ret and no trailing label: execution falls through to
 * whatever follows the include.
 */
/* Prologue: rbx/rbp/rsi/rdi go into the four slots above the return address; r14/r15 are pushed. */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :rdi = [32]^[0]  : [40]^[8]
 *   xmm3    = [48]^[16] : [56]^[24]
 */
mov rax, QWORD PTR [rcx+48]
mov ebp, 262144
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
/* r15 = pointer to a byte look-up table, taken from [ctx+264]. */
mov r15, QWORD PTR [rcx+264]
and edx, 1048560
/* r14 = 8 bytes at offset 35 of the buffer pointed to by [ctx+256], XORed with [ctx+192]; XORed into every second-line store below. */
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
/* rsi = base pointer loaded from [ctx+224]; xmm2 = first 16-byte line. */
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
movdqu xmm2, XMMWORD PTR [rdx+rsi]
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_litev1_mainloop_sandybridge:
/* One AES round on the current line with round key xmm1 = r8:rdi. */
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
/* r10 = low qword of the result; r9 = address of the second line. */
movq r10, xmm2
mov r9d, r10d
and r9d, 1048560
add r9, rsi
/* Write (result ^ xmm3) back to the first line; xmm3 <- result for the next iteration. */
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
/* Byte 11 of the value just stored is replaced by table[r15 + that byte]. */
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
/* 64x64 -> 128 bit multiply of the second line's first qword by r10: rdx:rax. */
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
mul r10
/* r8 += high half, rdi += low half; store r8 and (rdi ^ r14) at the second line. */
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
/* XOR in the previous contents of the second line and derive the next index. */
xor r8, rbx
mov rdx, r8
and edx, 1048560
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cn_litev1_mainloop_sandybridge
/* Epilogue: the saved slots are now 16 bytes further up because of the two pushes. */
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -1,166 +0,0 @@
/*
 * Main-loop fragment labelled cn_litev1_mainloop_soft_aes_sandybridge.
 *
 * Table-driven counterpart of the aesenc-based loop: the AES round is
 * computed with dword look-ups through the pointer at [ctx+272]. The
 * addressing implies four consecutive tables of 256 dwords (1024 bytes)
 * each - presumably AES T-tables, confirm against the code that fills them.
 *
 * Input: rcx points at a structure read at offsets 0..56, 192, 224, 256,
 * 264 and 272. The loop runs 262144 times and masks indices with 1048560
 * (0xFFFF0). XMM registers double as spill slots:
 *   xmm5 = value XORed into every second-line store, xmm6 = ctx pointer,
 *   xmm7 = byte-table pointer, xmm8 = base pointer, xmm9 = loop counter.
 * The fragment has no ret and no trailing label: execution falls through to
 * whatever follows the include.
 */
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 72
/* Save xmm6-xmm9; [rsp+64] is the scratch slot for the current index. */
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
/*
 * Initial state, each value being the XOR of two 8-byte fields:
 *   r8 :r13 = [32]^[0]  : [40]^[8]
 *   xmm4    = [48]^[16] : [56]^[24]
 */
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
mov rdi, QWORD PTR [rcx+224]
and edx, 1048560
/* xmm5 = 8 bytes at offset 35 of the buffer pointed to by [ctx+256], XORed with [ctx+192]. */
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
/* eax = iteration counter; parked in xmm9 at the top of every iteration. */
mov eax, 262144
/* Loop-entry alignment: 16 bytes when __APPLE__ is defined, 64 bytes otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
cn_litev1_mainloop_soft_aes_sandybridge:
movq xmm9, rax
/* r12 = table pointer; fetch the four dwords of the current line (esi, r10d, r14d, ebp). */
mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
/* Byte 0 of every dword indexes the table at r12+0. */
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
/* Byte 1 of every dword indexes the table at r12+1024; each source dword feeds a different output column. */
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
/* Advance to the third table; byte 2 indexes it directly and byte 3 (index + 256) reaches the fourth. */
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
/* edi was used as a temporary above; reload the base pointer from xmm8. */
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
/* Assemble the four output dwords into xmm3 and XOR with the round key r8:r13. */
movd xmm3, eax
movq rax, xmm7
punpckldq xmm3, xmm0
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
/* r9 = low qword of the round result; r10 = second index. */
movq r9, xmm3
mov r10d, r9d
and r10d, 1048560
/* Write (round result ^ xmm4) back to the first line. */
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
/* Byte 11 of the value just stored is replaced by table[rax + that byte] (rax = pointer from xmm7). */
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
/* 64x64 -> 128 bit multiply of the second line's first qword by the round result's low qword: rdx:rax. */
mov rax, rbx
/* xmm4 <- round result for the next iteration. */
movdqa xmm4, xmm3
mul rcx
/* Restore the ctx pointer into rcx for the next iteration's table-pointer load. */
movq rcx, xmm6
/* r8 += high half, r13 += low half; store r8 and (r13 ^ xmm5) at the second line. */
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
/* Reload the loop counter, derive the next index and spill it to [rsp+64]. */
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 1048560
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_litev1_mainloop_soft_aes_sandybridge
/* Epilogue: restore xmm6-xmm9 and the pushed general-purpose registers. */
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -7,40 +7,44 @@
# define FN_PREFIX(fn) fn # define FN_PREFIX(fn) fn
.section .text .section .text
#endif #endif
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm)
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ivybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ryzen_asm)
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_bulldozer_asm)
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm)
.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm)
.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm)
.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm)
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv1_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv1_mainloop_sandybridge.inc" #include "cnv1_main_loop_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -49,10 +53,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_litev1_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_litev1_mainloop_sandybridge.inc" #include "cnv1_main_loop_lite_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -61,10 +65,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fast_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fast_mainloop_sandybridge.inc" #include "cnv1_main_loop_fast_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -73,7 +77,19 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv2_mainloop_ivybridge_asm): FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv1_main_loop_upx_sandybridge.inc"
add rsp, 48
ret 0
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cnv2_main_loop_ivybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv2_main_loop_ivybridge.inc" #include "cnv2_main_loop_ivybridge.inc"
@ -85,7 +101,7 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv2_mainloop_ryzen_asm): FN_PREFIX(cnv2_main_loop_ryzen_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv2_main_loop_ryzen.inc" #include "cnv2_main_loop_ryzen.inc"
@ -97,7 +113,7 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv2_mainloop_bulldozer_asm): FN_PREFIX(cnv2_main_loop_bulldozer_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv2_main_loop_bulldozer.inc" #include "cnv2_main_loop_bulldozer.inc"
@ -109,7 +125,7 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_main_loop_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
mov rdx, rsi mov rdx, rsi
@ -122,10 +138,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm): FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fastv2_main_loop_ivybridge.inc" #include "cnv2_main_loop_fastv2_ivybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -134,10 +150,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fastv2_mainloop_ryzen_asm): FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fastv2_main_loop_ryzen.inc" #include "cnv2_main_loop_fastv2_ryzen.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -146,10 +162,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm): FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fastv2_main_loop_bulldozer.inc" #include "cnv2_main_loop_fastv2_bulldozer.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -158,11 +174,11 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
mov rdx, rsi mov rdx, rsi
#include "cn_fastv2_double_main_loop_sandybridge.inc" #include "cnv2_double_main_loop_fastv2_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -171,10 +187,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_liteupx_mainloop_sandybridge.inc" #include "cnv2_main_loop_ultralite_ivybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -183,10 +199,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_ultralitev2_main_loop_ivybridge.inc" #include "cnv2_main_loop_ultralite_ryzen.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -195,11 +211,23 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm):
sub rsp, 48
mov rcx, rdi
#include "cnv2_main_loop_ultralite_bulldozer.inc"
add rsp, 48
ret 0
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
mov rdx, rsi mov rdx, rsi
#include "cn_ultralitev2_double_main_loop_sandybridge.inc" #include "cnv2_double_main_loop_ultralite_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -208,10 +236,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_ultralitev2_main_loop_ryzen.inc" #include "cnv1_main_loop_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -220,10 +248,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_ultralitev2_main_loop_bulldozer.inc" #include "cnv1_main_loop_lite_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -232,10 +260,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv1_mainloop_soft_aes_sandybridge.inc" #include "cnv1_main_loop_fast_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -244,10 +272,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_litev1_mainloop_soft_aes_sandybridge.inc" #include "cnv1_main_loop_upx_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -256,10 +284,10 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fast_mainloop_soft_aes_sandybridge.inc" #include "cnv2_main_loop_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
@ -268,45 +296,22 @@ ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cnv2_mainloop_soft_aes_sandybridge.inc" #include "cnv2_main_loop_fastv2_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm):
sub rsp, 48 sub rsp, 48
mov rcx, rdi mov rcx, rdi
#include "cn_fastv2_mainloop_soft_aes_sandybridge.inc" #include "cnv2_main_loop_ultralite_soft_aes_sandybridge.inc"
add rsp, 48
ret 0
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cn_liteupx_mainloop_soft_aes_sandybridge.inc"
add rsp, 48
ret 0
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
sub rsp, 48
mov rcx, rdi
#include "cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
add rsp, 48 add rsp, 48
ret 0 ret 0

View file

@ -1,414 +0,0 @@
/*
 * Setup for the two-hash ("double") ultralite v2 main loop.
 *
 * rcx and rdx are read as two independent context pointers (rdx is copied
 * to r9, rcx to r8).  From each context the code loads the pointer at
 * offset 224 (r13 for the first, rsi for the second) and XOR-combines
 * pairs of 64-bit fields in the 0..88 range; offsets 96 and 104 are loaded
 * as-is.  NOTE(review): the field meanings are not visible in this file -
 * presumably hash state words and the scratchpad pointer; confirm against
 * the context struct.
 *
 * Stack: 8 pushes + 184 bytes = 248 bytes are reserved here.  The slots at
 * rsp+256..rsp+276 therefore lie above this frame, in memory owned by the
 * includer - TODO confirm the wrapper reserves that space.
 */
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
/* Save the current MXCSR, then load control word 24448 (0x5F80). */
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
/* r14d is the loop counter: 65536 (0x10000) iterations. */
mov r14d, 65536
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
/* Spill xmm6-xmm15; rax still holds the entry rsp for the first three. */
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
/* 131056 = 0x1FFF0: 16-byte aligned offset below 128 KiB. */
and edx, 131056
mov rax, QWORD PTR [rcx+48]
/* xmm13 stays zero; the loop copies it to clear conversion targets. */
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
/* xmm14 = 1 << 52 in both 64-bit lanes. */
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
/* xmm12 = 1023 << 52 in both 64-bit lanes. */
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
/* Keep the first base pointer at [rsp]; the loop reuses r13 and reloads it. */
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 131056
punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
/* Preload the first 16-byte line of each hash (xmm11: second, xmm15: first). */
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
/* Align the loop entry that follows: 16 bytes on Apple targets, 64 elsewhere. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
/*
 * Main loop of the two-hash ultralite v2 kernel; runs until r14d reaches
 * zero.  Registers on entry to each iteration (set by the preceding setup
 * and by the loop tail): r13/rsi = base pointers of the two buffers,
 * edx/ecx = current masked offsets, r10:r11 and rdi:rbp = 128-bit keys,
 * xmm15/xmm11 = the 16-byte lines at those offsets.
 */
main_loop_double_ultralitev2_sandybridge:
/* First hash: one AES round on the current line with key r11:r10. */
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
/* Read the lines at offset^16/^32/^48, add xmm7/xmm2/xmm3, store them rotated. */
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
/* Next offset for the first hash comes from the low bits of the AES result. */
movq r11, xmm9
mov edx, r11d
and edx, 131056
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
/* Second hash: same AES round and neighbour shuffle on the rsi buffer. */
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 131056
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
/* First hash: fold the low lanes of xmm5/xmm4 into r10, then 64x64 multiply. */
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
/* Update the first hash's 128-bit accumulator kept at [rsp+256]/[rsp+264]. */
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 131056
mov QWORD PTR [rsp+264], rdx
/* Park the first hash's next offset; it is reloaded into rdx at the loop tail. */
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
/* r13 is reused as the second hash's line address; the base returns from [rsp]. */
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
/*
 * Packed division for both hashes: dividends (r11, r8) are pre-halved as
 * (x+1)>>1, divisors get bits 31 and 0 forced on, and divpd produces
 * floating-point quotient estimates that are verified in integers below.
 */
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
/* A negative remainder means the quotient estimate was one too high. */
js div_fix_1_ultralitev2_sandybridge
div_fix_1_ret_ultralitev2_sandybridge:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_ultralitev2_sandybridge
div_fix_2_ret_ultralitev2_sandybridge:
/* Pack quotient/remainder pairs into xmm4 and add them to the sqrt input. */
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
/* Packed square root of (input >> 12) + (1023 << 52), both lanes at once. */
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
/* Low 19 bits all zero: take the integer correction path. */
test r9, 524287
je sqrt_fix_1_ultralitev2_sandybridge
sqrt_fix_1_ret_ultralitev2_sandybridge:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_ultralitev2_sandybridge
sqrt_fix_2_ret_ultralitev2_sandybridge:
/* Second hash: 64x64 multiply, neighbour shuffle and accumulator update. */
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 131056
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
/* Loop tail: restore the registers the next iteration expects, age the xmm history. */
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_ultralitev2_sandybridge
/* Epilogue: restore MXCSR, xmm6-xmm15 and the pushed registers. */
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
/* Skip the out-of-line fix-up stubs that follow. */
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
/*
 * Out-of-line correction stubs for the double ultralite v2 loop.  Each one
 * is entered by a conditional jump from the loop and jumps back to its
 * matching *_ret label.
 */

/* Remainder in r11 went negative: decrement the quotient (rbx), add the divisor (rdx) back. */
div_fix_1_ultralitev2_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_ultralitev2_sandybridge
/* Same correction for the second lane: quotient in rdx, divisor in r9, remainder in r8. */
div_fix_2_ultralitev2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_ultralitev2_sandybridge
/*
 * Entered when the low 19 bits of the first sqrt result (r9) are zero.
 * Recomputes the shifted result in integers: decrement the raw bits, form
 * a product from the >>19 and >>20 values with the -1022<<32 bias, compare
 * it with the sqrt input (low lane of xmm3) and add the borrow back via
 * adc.  The result replaces the low lane of xmm5; the high lane is kept.
 */
sqrt_fix_1_ultralitev2_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
/* Same correction for the second sqrt result (r8); replaces the high lane of xmm5. */
sqrt_fix_2_ultralitev2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
/* End marker; the loop epilogue jumps here to skip the stubs above. */
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:

View file

@ -1,180 +0,0 @@
/*
 * Setup for the single-hash ultralite v2 main loop (Bulldozer variant).
 *
 * rcx is read as a context pointer (copied to r9).  rbx, rbp and rsi are
 * saved into slots above the entry rsp rather than pushed - TODO confirm
 * the includer reserves that space.  The pointer at [rcx+224] becomes the
 * base (rbx) for every masked memory access in the loop.
 * NOTE(review): the meaning of the individual context fields is not
 * visible in this file.
 */
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
/* Save the current MXCSR at [rsp], then load control word 24448 (0x5F80). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
/* ebp is the loop counter: 65536 (0x10000) iterations. */
mov ebp, 65536
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
/* 131056 = 0x1FFF0: 16-byte aligned offset below 128 KiB. */
and r10d, 131056
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
/* xmm8 stays zero; the loop copies it to clear the sqrt destination. */
xorps xmm8, xmm8
/* Only the low 16 bits of rax are written, but the shift by 52 discards the rest. */
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
/*
 * Main loop of the single-hash ultralite v2 kernel (Bulldozer variant);
 * runs until ebp reaches zero.  On entry to each iteration: rbx = buffer
 * base, r10d = masked offset, r8:r11 = 128-bit key, xmm3/xmm4 = results of
 * the two previous iterations, rdi/r15 = previous sqrt and division
 * results.
 */
cnv2_main_loop_ultralitev2_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
/* One AES round on the current line with key r11:r8. */
aesenc xmm5, xmm6
/* Read the lines at offset^16/^32/^48, add xmm3/xmm6/xmm4, store them rotated. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
/* rdi is reused here as the 1023 << 52 constant added before the sqrt. */
mov edi, 1023
shl rdi, 52
movq r14, xmm5
pextrq rax, xmm5, 1
movdqa xmm0, xmm5
pxor xmm0, xmm3
/* Next offset comes from the low bits of the AES result. */
mov r10, r14
and r10d, 131056
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
/* Divisor: (2 * previous sqrt + low AES dword) with bits 31 and 0 forced on. */
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
/* r15 = (remainder << 32) + low 32 bits of the quotient. */
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movq xmm0, rax
sqrtsd xmm1, xmm0
movq rdi, xmm1
/* Low 19 bits all zero: take the integer correction path. */
test rdi, 524287
je sqrt_fixup_ultralitev2_bulldozer
shr rdi, 19
sqrt_fixup_ultralitev2_bulldozer_ret:
/* 64x64 multiply, second neighbour shuffle and accumulator update. */
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 131056
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_ultralitev2_bulldozer
/* Epilogue: restore MXCSR, xmm6-xmm8 and the saved general registers. */
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
/* Skip the out-of-line fix-up stub that follows. */
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
/*
 * Out-of-line sqrt correction, entered when the low 19 bits of the raw
 * sqrtsd result (rdi) are zero.  Rebuilds the sqrt input in r9 (low lane of
 * xmm5 + r15), decrements the raw bits, forms a product from the >>19 and
 * >>20 values with the -1022<<32 bias, and adds the borrow of
 * (product - input) back into rdi.  Returns to the *_ret label with rdi
 * already shifted right by 19.
 */
sqrt_fixup_ultralitev2_bulldozer:
movq r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ultralitev2_bulldozer_ret
/* End marker; the loop epilogue jumps here to skip the stub above. */
cnv2_main_loop_ultralitev2_bulldozer_endp:

View file

@ -1,186 +0,0 @@
/*
 * Setup for the single-hash ultralite v2 main loop (Ivy Bridge variant).
 *
 * rcx is read as a context pointer (copied to r9).  rbx is saved into a
 * slot above the entry rsp - TODO confirm the includer reserves that
 * space; the other callee-saved registers are pushed.  The pointer at
 * [rcx+224] becomes the base (rbx) for every masked memory access.
 * NOTE(review): the meaning of the individual context fields is not
 * visible in this file.
 */
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
/* Save the current MXCSR at [rsp], then load control word 24448 (0x5F80). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
/* esi is the loop counter: 65536 (0x10000) iterations. */
mov esi, 65536
mov r8, QWORD PTR [rcx+32]
/* r13d = 0x80000001, OR-ed into the divisor inside the loop. */
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
/* 131056 = 0x1FFF0: 16-byte aligned offset below 128 KiB. */
and r10d, 131056
movq xmm5, rax
/* Only the low 16 bits of rax are written, but the shift by 52 discards the rest. */
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
/* Preload the first 16-byte line. */
movdqu xmm6, XMMWORD PTR [r10+rbx]
/* Align the loop entry that follows: 16 bytes on Apple targets, 64 elsewhere. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
/*
 * Main loop of the single-hash ultralite v2 kernel (Ivy Bridge variant);
 * runs until rsi reaches zero.  On entry to each iteration: rbx = buffer
 * base, r10d = masked offset, r8:r11 = 128-bit key, xmm6 = current line,
 * xmm4/xmm5 = results of the two previous iterations, xmm3/r15 = previous
 * sqrt and division results, r13d = 0x80000001.
 */
$main_loop_ultralitev2_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
/* One AES round on the current line with key r11:r8. */
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
/* Next offset comes from the low bits of the AES result. */
movq rbp, xmm6
mov r9, rbp
and r9d, 131056
/* Read the lines at offset^16/^32/^48, add xmm4/xmm7/xmm5, store them rotated. */
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
mov r10, r9
xor r10d, 32
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
/* Divisor: (2 * previous sqrt + low AES dword) with bits 31 and 0 forced on. */
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
/* rdx = (remainder << 32) + low 32 bits of the quotient; kept in r15. */
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
movq rdx, xmm3
/* Low 19 bits all zero: take the integer correction path. */
test edx, 524287
je $sqrt_fixup_ultralitev2_ivybridge
psrlq xmm3, 19
$sqrt_fixup_ultralitev2_ivybridge_ret:
/* 64x64 multiply, accumulator update and second neighbour shuffle. */
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
mov edi, r8d
and edi, 131056
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
/* Preload the line for the next iteration. */
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne $main_loop_ultralitev2_ivybridge
/* Epilogue: restore MXCSR, rbx, xmm6-xmm8 and the pushed registers. */
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
/* Skip the out-of-line fix-up stub that follows. */
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
/*
 * Out-of-line sqrt correction, entered when the low 19 bits of the raw
 * sqrtsd result (rdx) are zero.  r9 still holds the sqrt input computed in
 * the loop.  r13 is borrowed for the -1022<<32 bias and reloaded with
 * 0x80000001 before returning, because the loop relies on that value.
 * The corrected, already >>19 result is placed in xmm3.
 */
$sqrt_fixup_ultralitev2_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
/* not + sub adds (r13 + 1) to rcx without needing a scratch register. */
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
/* End marker; the loop epilogue jumps here to skip the stub above. */
$cnv2_main_loop_ultralitev2_ivybridge_endp:

View file

@ -1,183 +0,0 @@
/*
 * Setup for the single-hash ultralite v2 main loop (Ryzen variant).
 *
 * rcx is read as a context pointer (copied to r9).  rbx, rbp and rsi are
 * saved into slots above the entry rsp rather than pushed - TODO confirm
 * the includer reserves that space.  The pointer at [rcx+224] becomes the
 * base (rbx) for every masked memory access in the loop.
 * NOTE(review): the meaning of the individual context fields is not
 * visible in this file.
 */
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
/* Save the current MXCSR at [rsp], then load control word 24448 (0x5F80). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
/* ebp is the loop counter: 65536 (0x10000) iterations. */
mov ebp, 65536
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104]
/* 131056 = 0x1FFF0: 16-byte aligned offset below 128 KiB. */
and r10d, 131056
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
/* xmm8 stays zero; the loop copies it to clear the sqrt destination. */
xorps xmm8, xmm8
/* Only the low 16 bits of rax are written, but the shift by 52 discards the rest. */
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
/* Align the loop entry that follows: 16 bytes on Apple targets, 64 elsewhere. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
/*
 * Main loop of the single-hash ultralite v2 kernel (Ryzen variant); runs
 * until ebp reaches zero.  On entry to each iteration: rbx = buffer base,
 * r10d = masked offset, r8:r11 = 128-bit key, xmm3/xmm4 = results of the
 * two previous iterations, rdi/r15 = previous sqrt and division results,
 * xmm7 = 1023 << 52, xmm8 = 0.
 */
$main_loop_ultralitev2_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
/* One AES round on the current line with key r11:r8. */
aesenc xmm5, xmm6
/* Read the lines at offset^16/^32/^48, add xmm3/xmm6/xmm4, store them rotated. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
mov rsi, r15
xor rsi, rdi
movq r14, xmm5
movdqa xmm0, xmm5
pxor xmm0, xmm3
/* Next offset comes from the low bits of the AES result. */
mov r10, r14
and r10d, 131056
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
/* Divisor: (2 * previous sqrt + low AES dword) with bits 31 and 0 forced on. */
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
/* Interleave the low dwords: r15 = (remainder << 32) | low 32 bits of quotient. */
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
paddq xmm0, xmm5
/* xmm2 keeps the sqrt input for the fix-up path. */
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
/* Low 19 bits all zero: take the integer correction path. */
test rdi, 524287
je $sqrt_fixup_ultralitev2_ryzen
shr rdi, 19
$sqrt_fixup_ultralitev2_ryzen_ret:
/* 64x64 multiply, second neighbour shuffle and accumulator update. */
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movdqa xmm4, xmm3
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 131056
movdqa xmm3, xmm5
dec ebp
jne $main_loop_ultralitev2_ryzen
/* Epilogue: restore MXCSR, xmm6-xmm8 and the saved general registers. */
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
/* Skip the out-of-line fix-up stub that follows. */
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
/*
 * Out-of-line sqrt correction, entered when the low 19 bits of the raw
 * sqrtsd result (rdi) are zero.  The sqrt input is taken from the low lane
 * of xmm2.  The raw bits are decremented, a product is formed from the
 * >>19 and >>20 values with the -1022<<32 bias, and the borrow of
 * (product - input) is added back into rdi.  Returns to the *_ret label
 * with rdi already shifted right by 19.
 */
$sqrt_fixup_ultralitev2_ryzen:
movq r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp $sqrt_fixup_ultralitev2_ryzen_ret
/* End marker; the loop epilogue jumps here to skip the stub above. */
$cnv2_main_loop_ultralitev2_ryzen_endp:

View file

@ -1,271 +0,0 @@
/*
 * Setup for the software-AES ultralite v2 main loop.
 *
 * rcx is read as a context pointer (copied to r10) and is also saved at
 * [rsp+8] on entry; after the 8 pushes and the 152-byte frame that slot is
 * [rsp+224], from which the loop reloads r10.  The slots at rsp+240 and
 * rsp+248 likewise lie above the 216 bytes reserved here - TODO confirm the
 * includer reserves that space.
 * NOTE(review): the meaning of the individual context fields is not
 * visible in this file.
 */
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 152
/* Save the current MXCSR at [rsp+4], then load control word 24448 (0x5F80). */
stmxcsr DWORD PTR [rsp+4]
mov DWORD PTR [rsp], 24448
ldmxcsr DWORD PTR [rsp]
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
/* Spill xmm6-xmm13; they are restored after the loop. */
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
/* Only the low 16 bits of rax are written, but the shift by 52 discards the rest. */
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov rax, r8
punpcklqdq xmm4, xmm0
/* 131056 = 0x1FFF0: 16-byte aligned offset below 128 KiB. */
and eax, 131056
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
/* xmm9 stays zero; the loop copies it to clear the sqrt destination. */
xorps xmm9, xmm9
mov QWORD PTR [rsp+248], rax
/* The buffer base pointer is kept in xmm12 because the loop reuses r11. */
movq xmm12, r11
mov QWORD PTR [rsp+240], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
/*
 * r12d is the loop counter.  It must be 65536 (0x10000), the count used by
 * every other ultralite v2 loop paired with the 131056 mask; the previous
 * value, 262144, ran four times as many iterations.
 */
mov r12d, 65536
/* Align the loop entry that follows: 16 bytes on Apple targets, 64 elsewhere. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
/*
 * Main loop of the software-AES ultralite v2 kernel; runs until the
 * counter in r12d reaches zero.  The AES round is done with 32-bit table
 * lookups instead of aesenc.  On entry to each iteration: r10 = context
 * pointer, rax = masked offset (also at [rsp+248]), r11 = buffer base
 * (also in xmm12), r8 and r9 = 128-bit key (r9 mirrored at [rsp+240]),
 * xmm4/xmm5 = results of the two previous iterations, xmm13/xmm10 =
 * previous sqrt and division results.
 */
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
/* Park the loop counter in xmm11; r12 becomes the lookup-table pointer from [r10+272]. */
movd xmm11, r12d
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
/* Load the four dwords of the current line. */
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+248]
/*
 * Table-driven round: each byte of each dword indexes one of four
 * consecutive 1 KiB (256 x 4 byte) tables: r12, r12+1024, then after
 * `add r12, 2048` the third at r12 and the fourth via the `+256` index.
 */
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
/* Recover the buffer base; r11 was used as scratch above. */
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
mov rcx, rdx
xor eax, r15d
/* Assemble the four result dwords into xmm6 and XOR in the key (xmm7). */
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
/* Read the lines at offset^16/^32/^48, add xmm4/xmm7/xmm5, store them rotated. */
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
/* Next offset comes from the low bits of the round result. */
movq rdi, xmm6
mov r10, rdi
and r10d, 131056
xor edx, edx
mov rax, rcx
shl rax, 32
movq rbx, xmm10
xor rbx, rax
/* Divisor: (2 * previous sqrt + low result dword) with bits 31 and 0 forced on. */
lea r9, QWORD PTR [rcx+rcx]
add r9d, edi
movdqa xmm0, xmm6
pxor xmm0, xmm4
mov ecx, -2147483647
movdqu XMMWORD PTR [r13], xmm0
or r9, rcx
movdqa xmm0, xmm6
movaps xmm1, xmm9
psrldq xmm0, 8
movq rax, xmm0
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
div r9
/* rdx = (remainder << 32) + low 32 bits of the quotient; kept in xmm10. */
shl rdx, 32
mov eax, eax
add rdx, rax
lea r9, QWORD PTR [rdx+rdi]
movq xmm10, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm1, xmm0
movq rdx, xmm1
/* Low 19 bits all zero: take the integer correction path. */
test rdx, 524287
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
psrlq xmm1, 19
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
/* 64x64 multiply, second neighbour shuffle and accumulator update. */
mov r9, r10
movdqa xmm13, xmm1
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+240]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
/* Reload the context pointer saved on entry, and the parked loop counter. */
mov r10, QWORD PTR [rsp+224]
movd r12d, xmm11
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 131056
xor r9, rbp
mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax
sub r12d, 1
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
/* Epilogue: restore MXCSR, xmm6-xmm13 and the pushed registers. */
ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 152
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
/* Skip the out-of-line fix-up stub that follows. */
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
/*
 * Out-of-line sqrt correction, entered when the low 19 bits of the raw
 * sqrtsd result (rdx) are zero.  r9 still holds the sqrt input computed in
 * the loop.  The raw bits are decremented, a product is formed from the
 * >>19 and >>20 values with the -1022<<32 bias (held in r15), and the
 * borrow of (product - input) is added back.  The corrected, already >>19
 * result is placed in xmm1.
 */
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
dec rdx
mov r15d, -1022
shl r15, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
lea rcx, [rcx+r15+1]
add rax, r15
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm1, rdx
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
/* End marker; the loop epilogue jumps here to skip the stub above. */
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:

View file

@ -5,7 +5,7 @@
push r14 push r14
push r15 push r15
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov ebp, 262144 mov ebp, ${ITERATIONS}
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56] mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24] xor rdx, QWORD PTR [rcx+24]
@ -18,7 +18,7 @@
xor rdi, QWORD PTR [rcx+8] xor rdi, QWORD PTR [rcx+8]
mov rdx, r8 mov rdx, r8
mov r15, QWORD PTR [rcx+264] mov r15, QWORD PTR [rcx+264]
and edx, 2097136 and edx, ${MASK}
mov r14, QWORD PTR [rax+35] mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192] xor r14, QWORD PTR [rcx+192]
mov rsi, QWORD PTR [rcx+224] mov rsi, QWORD PTR [rcx+224]
@ -30,14 +30,14 @@
#else #else
ALIGN 64 ALIGN 64
#endif #endif
cn_fast_mainloop_sandybridge: cnv1_main_loop_${ALGO}_sandybridge:
movq xmm0, rdi movq xmm0, rdi
movq xmm1, r8 movq xmm1, r8
punpcklqdq xmm1, xmm0 punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1 aesenc xmm2, xmm1
movq r10, xmm2 movq r10, xmm2
mov r9d, r10d mov r9d, r10d
and r9d, 2097136 and r9d, ${MASK}
add r9, rsi add r9, rsi
movdqa xmm0, xmm2 movdqa xmm0, xmm2
pxor xmm0, xmm3 pxor xmm0, xmm3
@ -60,11 +60,11 @@ cn_fast_mainloop_sandybridge:
mov QWORD PTR [r9+8], rax mov QWORD PTR [r9+8], rax
xor r8, rbx xor r8, rbx
mov rdx, r8 mov rdx, r8
and edx, 2097136 and edx, ${MASK}
movdqu xmm2, XMMWORD PTR [rdx+rsi] movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11 xor rdi, r11
dec ebp dec ebp
jne cn_fast_mainloop_sandybridge jne cnv1_main_loop_${ALGO}_sandybridge
mov rbx, QWORD PTR [rsp+24] mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32] mov rbp, QWORD PTR [rsp+32]

View file

@ -26,7 +26,7 @@
xor r13, QWORD PTR [rcx+8] xor r13, QWORD PTR [rcx+8]
mov rdx, r8 mov rdx, r8
mov rdi, QWORD PTR [rcx+224] mov rdi, QWORD PTR [rcx+224]
and edx, 2097136 and edx, ${MASK}
mov rax, QWORD PTR [rax+35] mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192] xor rax, QWORD PTR [rcx+192]
movq xmm5, rax movq xmm5, rax
@ -38,14 +38,14 @@
mov rax, QWORD PTR [rcx+264] mov rax, QWORD PTR [rcx+264]
movq xmm7, rax movq xmm7, rax
mov eax, 524288 mov eax, ${ITERATIONS}
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
cnv1_mainloop_soft_aes_sandybridge: cnv1_main_loop_${ALGO}_soft_aes_sandybridge:
movq xmm9, rax movq xmm9, rax
mov r12, QWORD PTR [rcx+272] mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi] mov esi, DWORD PTR [rdx+rdi]
@ -118,7 +118,7 @@ cnv1_mainloop_soft_aes_sandybridge:
pxor xmm3, xmm1 pxor xmm3, xmm1
movq r9, xmm3 movq r9, xmm3
mov r10d, r9d mov r10d, r9d
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm0, xmm3 movdqa xmm0, xmm3
pxor xmm0, xmm4 pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0 movdqu XMMWORD PTR [rdx+rdi], xmm0
@ -145,10 +145,10 @@ cnv1_mainloop_soft_aes_sandybridge:
movq rax, xmm9 movq rax, xmm9
mov rdx, r8 mov rdx, r8
xor r13, r11 xor r13, r11
and edx, 2097136 and edx, ${MASK}
mov QWORD PTR [rsp+64], rdx mov QWORD PTR [rsp+64], rdx
sub eax, 1 sub eax, 1
jne cnv1_mainloop_soft_aes_sandybridge jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge
movaps xmm6, XMMWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16] movaps xmm7, XMMWORD PTR [rsp+16]

View file

@ -1,74 +0,0 @@
/*
 * Setup for the v1 main loop (hardware AES).
 *
 * rcx is read as a context pointer.  rbx, rbp, rsi and rdi are saved into
 * slots above the entry rsp rather than pushed - TODO confirm the includer
 * reserves that space; r14 and r15 are pushed.
 * NOTE(review): the meaning of the individual context fields is not
 * visible in this file.
 */
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
mov rax, QWORD PTR [rcx+48]
/* ebp is the loop counter: 524288 (0x80000) iterations. */
mov ebp, 524288
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
/* r15 = pointer from [rcx+264]; the loop uses it as a byte lookup table. */
mov r15, QWORD PTR [rcx+264]
/* 2097136 = 0x1FFFF0: 16-byte aligned offset below 2 MiB. */
and edx, 2097136
/* r14 = 8 bytes at offset 35 of the buffer at [rcx+256], XORed with [rcx+192]. */
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
/* Preload the first 16-byte line. */
movdqu xmm2, XMMWORD PTR [rdx+rsi]
/* Align the loop entry that follows: 16 bytes on Apple targets, 64 elsewhere. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
/*
 * Main loop of the v1 kernel (hardware AES); runs until ebp reaches zero.
 * On entry to each iteration: rsi = buffer base, rdx = masked offset,
 * r8:rdi = 128-bit key, xmm2 = current line, xmm3 = previous AES result,
 * r14 = 64-bit XOR constant, r15 = byte lookup table.
 */
cnv1_mainloop_sandybridge:
/* One AES round on the current line with key rdi:r8. */
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
/* r9 = address of the next line, selected by the low bits of the AES result. */
movq r10, xmm2
mov r9d, r10d
and r9d, 2097136
add r9, rsi
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
/* Byte 11 of the stored value indexes the r15 table; the result overwrites byte 11. */
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
/* 64x64 multiply: high half is added to r8, low half to rdi. */
mov rax, rbx
mul r10
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
/* The upper qword is stored XORed with r14; rdi itself keeps the un-XORed sum. */
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
xor r8, rbx
mov rdx, r8
and edx, 2097136
/* Preload the line for the next iteration. */
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cnv1_mainloop_sandybridge
/* Epilogue: reload the registers saved above the entry rsp (+16 for the two pushes). */
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -1,414 +0,0 @@
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
mov r14d, 524288
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 2097136
punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
main_loop_double_sandybridge:
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
movq r11, xmm9
mov edx, r11d
and edx, 2097136
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
xor edx, 16
mov r8d, edx
mov r15d, edx
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
lea rax, QWORD PTR [r11+1]
shr rax, 1
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
js div_fix_1_sandybridge
div_fix_1_ret_sandybridge:
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_sandybridge
div_fix_2_ret_sandybridge:
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
test r9, 524287
je sqrt_fix_1_sandybridge
sqrt_fix_1_ret_sandybridge:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_sandybridge
sqrt_fix_2_ret_sandybridge:
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_sandybridge
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
jmp cnv2_double_mainloop_asm_sandybridge_endp
div_fix_1_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_sandybridge
div_fix_2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_sandybridge
sqrt_fix_1_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_sandybridge
sqrt_fix_2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_sandybridge
cnv2_double_mainloop_asm_sandybridge_endp:

View file

@ -18,7 +18,7 @@
mov r10, QWORD PTR [rcx+32] mov r10, QWORD PTR [rcx+32]
mov r8, rcx mov r8, rcx
xor r10, QWORD PTR [rcx] xor r10, QWORD PTR [rcx]
mov r14d, 262144 mov r14d, 524288
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8] xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224] mov rsi, QWORD PTR [rdx+224]
@ -99,7 +99,7 @@
#else #else
ALIGN 64 ALIGN 64
#endif #endif
main_loop_double_fast2_sandybridge: cnv2_double_main_loop_${ALGO}_sandybridge:
movdqu xmm9, xmm15 movdqu xmm9, xmm15
mov eax, edx mov eax, edx
mov ebx, edx mov ebx, edx
@ -253,8 +253,8 @@ main_loop_double_fast2_sandybridge:
mov rbx, rax mov rbx, rax
imul rax, rdx imul rax, rdx
sub r11, rax sub r11, rax
js div_fix_1_fast2_sandybridge js div_fix_1_${ALGO}_sandybridge
div_fix_1_ret_fast2_sandybridge: div_fix_1_ret_${ALGO}_sandybridge:
cvttsd2si rdx, xmm2 cvttsd2si rdx, xmm2
mov rax, rdx mov rax, rdx
@ -262,8 +262,8 @@ div_fix_1_ret_fast2_sandybridge:
movd xmm2, r11d movd xmm2, r11d
movd xmm4, ebx movd xmm4, ebx
sub r8, rax sub r8, rax
js div_fix_2_fast2_sandybridge js div_fix_2_${ALGO}_sandybridge
div_fix_2_ret_fast2_sandybridge: div_fix_2_ret_${ALGO}_sandybridge:
movd xmm1, r8d movd xmm1, r8d
movd xmm0, edx movd xmm0, edx
@ -279,15 +279,15 @@ div_fix_2_ret_fast2_sandybridge:
movdqa xmm5, xmm1 movdqa xmm5, xmm1
psrlq xmm5, 19 psrlq xmm5, 19
test r9, 524287 test r9, 524287
je sqrt_fix_1_fast2_sandybridge je sqrt_fix_1_${ALGO}_sandybridge
sqrt_fix_1_ret_fast2_sandybridge: sqrt_fix_1_ret_${ALGO}_sandybridge:
movq r9, xmm10 movq r9, xmm10
psrldq xmm1, 8 psrldq xmm1, 8
movq r8, xmm1 movq r8, xmm1
test r8, 524287 test r8, 524287
je sqrt_fix_2_fast2_sandybridge je sqrt_fix_2_${ALGO}_sandybridge
sqrt_fix_2_ret_fast2_sandybridge: sqrt_fix_2_ret_${ALGO}_sandybridge:
mov r12d, ecx mov r12d, ecx
mov r8d, ecx mov r8d, ecx
@ -335,7 +335,7 @@ sqrt_fix_2_ret_fast2_sandybridge:
movdqa xmm6, xmm10 movdqa xmm6, xmm10
mov r9, r15 mov r9, r15
dec r14d dec r14d
jne main_loop_double_fast2_sandybridge jne cnv2_double_main_loop_${ALGO}_sandybridge
ldmxcsr DWORD PTR [rsp+272] ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48] movaps xmm13, XMMWORD PTR [rsp+48]
@ -358,19 +358,19 @@ sqrt_fix_2_ret_fast2_sandybridge:
pop rsi pop rsi
pop rbp pop rbp
pop rbx pop rbx
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp jmp cnv2_double_main_loop_${ALGO}_sandybridge_endp
div_fix_1_fast2_sandybridge: div_fix_1_${ALGO}_sandybridge:
dec rbx dec rbx
add r11, rdx add r11, rdx
jmp div_fix_1_ret_fast2_sandybridge jmp div_fix_1_ret_${ALGO}_sandybridge
div_fix_2_fast2_sandybridge: div_fix_2_${ALGO}_sandybridge:
dec rdx dec rdx
add r8, r9 add r8, r9
jmp div_fix_2_ret_fast2_sandybridge jmp div_fix_2_ret_${ALGO}_sandybridge
sqrt_fix_1_fast2_sandybridge: sqrt_fix_1_${ALGO}_sandybridge:
movq r8, xmm3 movq r8, xmm3
movdqa xmm0, xmm5 movdqa xmm0, xmm5
psrldq xmm0, 8 psrldq xmm0, 8
@ -389,9 +389,9 @@ sqrt_fix_1_fast2_sandybridge:
adc r9, 0 adc r9, 0
movq xmm5, r9 movq xmm5, r9
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_fast2_sandybridge jmp sqrt_fix_1_ret_${ALGO}_sandybridge
sqrt_fix_2_fast2_sandybridge: sqrt_fix_2_${ALGO}_sandybridge:
psrldq xmm3, 8 psrldq xmm3, 8
movq r11, xmm3 movq r11, xmm3
dec r8 dec r8
@ -409,6 +409,6 @@ sqrt_fix_2_fast2_sandybridge:
adc r8, 0 adc r8, 0
movq xmm0, r8 movq xmm0, r8
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_fast2_sandybridge jmp sqrt_fix_2_ret_${ALGO}_sandybridge
cnv2_double_mainloop_asm_fast2_sandybridge_endp: cnv2_double_main_loop_${ALGO}_sandybridge_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov ebp, 524288 mov ebp, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
@ -31,7 +31,7 @@
mov rcx, QWORD PTR [rcx+88] mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72] xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104] mov rdi, QWORD PTR [r9+104]
and r10d, 2097136 and r10d, ${MASK}
movaps XMMWORD PTR [rsp+48], xmm6 movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7 movaps XMMWORD PTR [rsp+32], xmm7
@ -46,7 +46,7 @@
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
ALIGN 16 ALIGN 16
cnv2_main_loop_bulldozer: cnv2_main_loop_${ALGO}_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx] movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm6, r8 movq xmm6, r8
pinsrq xmm6, r11, 1 pinsrq xmm6, r11, 1
@ -83,7 +83,7 @@ cnv2_main_loop_bulldozer:
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm3 pxor xmm0, xmm3
mov r10, r14 mov r10, r14
and r10d, 2097136 and r10d, ${MASK}
movdqa XMMWORD PTR [rdx], xmm0 movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx] xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx]
@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movq rdi, xmm1 movq rdi, xmm1
test rdi, 524287 test rdi, 524287
je sqrt_fixup_bulldozer je sqrt_fixup_${ALGO}_bulldozer
shr rdi, 19 shr rdi, 19
sqrt_fixup_bulldozer_ret: sqrt_fixup_${ALGO}_bulldozer_ret:
mov rax, rsi mov rax, rsi
mul r14 mul r14
movq xmm1, rax movq xmm1, rax
@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret:
mov QWORD PTR [r12+8], r11 mov QWORD PTR [r12+8], r11
mov r10, r8 mov r10, r8
xor r11, r13 xor r11, r13
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm3, xmm5 movdqa xmm3, xmm5
dec ebp dec ebp
jne cnv2_main_loop_bulldozer jne cnv2_main_loop_${ALGO}_bulldozer
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48] movaps xmm6, XMMWORD PTR [rsp+48]
@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret:
pop r13 pop r13
pop r12 pop r12
pop rdi pop rdi
jmp cnv2_main_loop_bulldozer_endp jmp cnv2_main_loop_${ALGO}_bulldozer_endp
sqrt_fixup_bulldozer: sqrt_fixup_${ALGO}_bulldozer:
movq r9, xmm5 movq r9, xmm5
add r9, r15 add r9, r15
dec rdi dec rdi
@ -175,6 +175,6 @@ sqrt_fixup_bulldozer:
imul rcx, rax imul rcx, rax
sub rcx, r9 sub rcx, r9
adc rdi, 0 adc rdi, 0
jmp sqrt_fixup_bulldozer_ret jmp sqrt_fixup_${ALGO}_bulldozer_ret
cnv2_main_loop_bulldozer_endp: cnv2_main_loop_${ALGO}_bulldozer_endp:

View file

@ -1,186 +0,0 @@
/*
 * Main-loop body, "ivybridge" flavour.
 *
 * This is a bare instruction sequence: it has no procedure header and no ret,
 * and its exit path jumps to the trailing $cnv2_main_loop_ivybridge_endp
 * label, so it is presumably pasted into a surrounding procedure by the file
 * that includes it.
 *
 * Input: rcx is the base pointer for every initial load (offsets 0..104 and
 * 224). NOTE(review): presumably a hash context whose pointer at +224 is the
 * scratchpad -- confirm against the caller.
 */
/* Prologue: park rbx in a slot above the return address, push the other
   callee-saved GPRs used below and reserve 80 bytes of locals. */
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
/* Save the caller's MXCSR at [rsp], then load 24448 (0x5F80): exception mask
   bits set and rounding-control field = 10b (round toward +infinity). */
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
/* Build the working state from the qwords at [rcx+0..104]. r9 keeps a copy
   of the base pointer because rcx itself is overwritten further down. */
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
/* rsi = iteration counter (524288 = 0x80000). */
mov esi, 524288
mov r8, QWORD PTR [rcx+32]
/* r13 = 0x80000001 (32-bit write zero-extends); OR-ed into the divisor in
   the loop so the divisor is odd and has bit 31 set. */
mov r13d, -2147483647
/* r8 = q[32]^q[0], r11 = q[40]^q[8]; r10 starts as a copy of r8. */
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
/* xmm4 ends up as { q[48]^q[16], q[56]^q[24] } (low, high). */
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
/* rbx = pointer stored at [rcx+224]; base address of every buffer access. */
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
movq xmm3, QWORD PTR [r9+104]
/* Preserve xmm6-xmm8 in the local area; they are restored in the epilogue. */
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
/* r10 = first buffer offset: 2097136 = 0x1FFFF0 keeps a 16-byte aligned
   offset below 2 MiB. */
and r10d, 2097136
movq xmm5, rax
/* xmm8 = 1023 << 52. Only the low 16 bits of rax are written here, which is
   enough because the shift by 52 discards everything above bit 11. */
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
/* xmm5 = { q[80]^q[64], q[88]^q[72] }. */
movq xmm0, rcx
punpcklqdq xmm5, xmm0
/* Preload the first 16-byte buffer line into xmm6. */
movdqu xmm6, XMMWORD PTR [r10+rbx]
/* Loop-head alignment: 16 bytes when __APPLE__ is defined, 64 otherwise. */
#ifdef __APPLE__
ALIGN 16
#else
ALIGN 64
#endif
$main_loop_ivybridge:
/* rdx = address of the current line; ecx/eax/r10d = its offset XOR 16/32/48,
   i.e. the three neighbouring 16-byte lines. */
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
/* xmm7 = { r8, r11 }; one AES round of the current line keyed with it. */
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
/* rbp = low qword of the AES result; r9 = offset of the second line. */
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
/* Neighbour shuffle: each neighbour gets another neighbour's old value plus
   a state vector:
   [^16] <- old[^48] + xmm5, [^32] <- old[^16] + xmm4, [^48] <- old[^32] + xmm7. */
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
/* r10 = second-line offset XOR 32 (used after the sqrt step). */
mov r10, r9
xor r10d, 32
/* rdi = r15 ^ (low qword of xmm3 << 32); rcx keeps the unshifted qword. */
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
/* Write (AES result ^ xmm4) back to the current line. */
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
/* rdi ^= first qword of the second line; r14 = its address, r12 = its
   second qword (needed at the end of the iteration). */
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
/* Integer division step. rdx = 0, so the dividend is just rax.
   Divisor r9 = (2*ecx + ebp) | 0x80000001 as a 32-bit value, zero-extended;
   dividend = high qword of the AES result. */
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
/* rdx = (remainder << 32) + low 32 bits of the quotient; kept in r15 for the
   next iteration. */
mov eax, eax
shl rdx, 32
add rdx, rax
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
/* Square-root step: xmm3 = sqrtsd of the bit pattern ((r9 >> 12) + (1023 << 52))
   interpreted as a double. */
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
movq rdx, xmm3
/* If the low 19 bits of the result are all zero take the fix-up path;
   otherwise the result is simply shifted right by 19. */
test edx, 524287
je $sqrt_fixup_ivybridge
psrlq xmm3, 19
$sqrt_fixup_ivybridge_ret:
/* 64x64 -> 128 multiply: rdx:rax = rdi * rbp. ecx = second-line offset ^ 32. */
mov ecx, r10d
mov rax, rdi
mul rbp
/* High half: XOR with the first qword of the [^32] neighbour, add into r8,
   store r8 as the first qword of the second line. */
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
/* r8 ^= rdi; edi = offset of the next iteration's line. */
xor r8, rdi
mov edi, r8d
and edi, 2097136
/* Low half: XOR with the second qword of the [^32] neighbour, add into r11,
   store r11 as the second qword of the second line. */
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
/* xmm2 = { product high, product low } using the values captured before the
   XORs above. */
punpcklqdq xmm2, xmm0
/* r9d = second-line offset ^ 16, r10d = second-line offset ^ 48. */
mov r9d, r10d
xor r9d, 48
xor r10d, 16
/* Second neighbour shuffle, with the product mixed into the [^16] value:
   [^16] <- old[^48] + xmm5, [^32] <- (old[^16] ^ product) + xmm4,
   [^48] <- old[^32] + xmm7. The state rotation xmm5 <- xmm4 <- xmm6 is
   interleaved with the stores. */
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
/* Preload the next line, finish r11 with the saved second qword and loop
   until the counter in rsi reaches zero. */
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne $main_loop_ivybridge
/* Epilogue: restore MXCSR, rbx (24 + 7 pushes*8 + 80 = 160), xmm6-xmm8 and
   the pushed registers, then jump past the fix-up code. */
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
jmp $cnv2_main_loop_ivybridge_endp
/* Out-of-line fix-up, entered with rdx = raw sqrtsd bits and r9 = the sqrt
   input sum. It borrows r13 as scratch and reloads its 0x80000001 constant
   before returning. */
$sqrt_fixup_ivybridge:
dec rdx
/* r13 = 0xFFFFFC02 << 32 (the 32-bit value -1022, zero-extended, shifted). */
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
/* rcx -= ~r13, which is the same as rcx += r13 + 1. */
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
/* Add 1 to rdx when the product is below r9 (unsigned borrow). */
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp $sqrt_fixup_ivybridge_ret
$cnv2_main_loop_ivybridge_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov esi, 262144 mov esi, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647 mov r13d, -2147483647
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
@ -35,7 +35,7 @@
movaps XMMWORD PTR [rsp+64], xmm6 movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7 movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8 movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136 and r10d, ${MASK}
movq xmm5, rax movq xmm5, rax
mov ax, 1023 mov ax, 1023
@ -52,7 +52,7 @@
#else #else
ALIGN 64 ALIGN 64
#endif #endif
$main_loop_fast2_ivybridge: cnv2_main_loop_${ALGO}_ivybridge:
lea rdx, QWORD PTR [r10+rbx] lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d mov ecx, r10d
mov eax, r10d mov eax, r10d
@ -66,7 +66,7 @@ $main_loop_fast2_ivybridge:
aesenc xmm6, xmm7 aesenc xmm6, xmm7
movq rbp, xmm6 movq rbp, xmm6
mov r9, rbp mov r9, rbp
and r9d, 2097136 and r9d, ${MASK}
movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx]
@ -109,9 +109,9 @@ $main_loop_fast2_ivybridge:
sqrtsd xmm3, xmm0 sqrtsd xmm3, xmm0
movq rdx, xmm3 movq rdx, xmm3
test edx, 524287 test edx, 524287
je $sqrt_fixup_fast2_ivybridge je sqrt_fixup_${ALGO}_ivybridge
psrlq xmm3, 19 psrlq xmm3, 19
$sqrt_fixup_fast2_ivybridge_ret: sqrt_fixup_${ALGO}_ivybridge_ret:
mov ecx, r10d mov ecx, r10d
mov rax, rdi mov rax, rdi
@ -122,7 +122,7 @@ $sqrt_fixup_fast2_ivybridge_ret:
mov QWORD PTR [r14], r8 mov QWORD PTR [r14], r8
xor r8, rdi xor r8, rdi
mov edi, r8d mov edi, r8d
and edi, 2097136 and edi, ${MASK}
movq xmm0, rax movq xmm0, rax
xor rax, [rcx+rbx+8] xor rax, [rcx+rbx+8]
add r11, rax add r11, rax
@ -147,7 +147,7 @@ $sqrt_fixup_fast2_ivybridge_ret:
mov r10d, edi mov r10d, edi
xor r11, r12 xor r11, r12
dec rsi dec rsi
jne $main_loop_fast2_ivybridge jne cnv2_main_loop_${ALGO}_ivybridge
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160] mov rbx, QWORD PTR [rsp+160]
@ -162,9 +162,9 @@ $sqrt_fixup_fast2_ivybridge_ret:
pop rdi pop rdi
pop rsi pop rsi
pop rbp pop rbp
jmp $cnv2_main_loop_fast2_ivybridge_endp jmp cnv2_main_loop_${ALGO}_ivybridge_endp
$sqrt_fixup_fast2_ivybridge: sqrt_fixup_${ALGO}_ivybridge:
dec rdx dec rdx
mov r13d, -1022 mov r13d, -1022
shl r13, 32 shl r13, 32
@ -181,6 +181,6 @@ $sqrt_fixup_fast2_ivybridge:
sub rcx, r9 sub rcx, r9
adc rdx, 0 adc rdx, 0
movq xmm3, rdx movq xmm3, rdx
jmp $sqrt_fixup_fast2_ivybridge_ret jmp sqrt_fixup_${ALGO}_ivybridge_ret
$cnv2_main_loop_fast2_ivybridge_endp: cnv2_main_loop_${ALGO}_ivybridge_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov ebp, 524288 mov ebp, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
@ -31,7 +31,7 @@
mov rcx, QWORD PTR [rcx+88] mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72] xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104] mov rdi, QWORD PTR [r9+104]
and r10d, 2097136 and r10d, ${MASK}
movaps XMMWORD PTR [rsp+48], xmm6 movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7 movaps XMMWORD PTR [rsp+32], xmm7
@ -50,7 +50,7 @@
#else #else
ALIGN 64 ALIGN 64
#endif #endif
$main_loop_ryzen: cnv2_main_loop_${ALGO}_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx] movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11 movq xmm0, r11
movq xmm6, r8 movq xmm6, r8
@ -82,7 +82,7 @@ $main_loop_ryzen:
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm3 pxor xmm0, xmm3
mov r10, r14 mov r10, r14
and r10d, 2097136 and r10d, ${MASK}
movdqa XMMWORD PTR [rdx], xmm0 movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx] xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx]
@ -107,10 +107,10 @@ $main_loop_ryzen:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movq rdi, xmm1 movq rdi, xmm1
test rdi, 524287 test rdi, 524287
je $sqrt_fixup_ryzen je sqrt_fixup_${ALGO}_ryzen
shr rdi, 19 shr rdi, 19
$sqrt_fixup_ryzen_ret: sqrt_fixup_${ALGO}_ryzen_ret:
mov rax, rsi mov rax, rsi
mul r14 mul r14
movq xmm1, rax movq xmm1, rax
@ -142,10 +142,10 @@ $sqrt_fixup_ryzen_ret:
mov QWORD PTR [r12+8], r11 mov QWORD PTR [r12+8], r11
mov r10, r8 mov r10, r8
xor r11, r13 xor r11, r13
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm3, xmm5 movdqa xmm3, xmm5
dec ebp dec ebp
jne $main_loop_ryzen jne cnv2_main_loop_${ALGO}_ryzen
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48] movaps xmm6, XMMWORD PTR [rsp+48]
@ -161,9 +161,9 @@ $sqrt_fixup_ryzen_ret:
pop r13 pop r13
pop r12 pop r12
pop rdi pop rdi
jmp $cnv2_main_loop_ryzen_endp jmp cnv2_main_loop_${ALGO}_ryzen_endp
$sqrt_fixup_ryzen: sqrt_fixup_${ALGO}_ryzen:
movq r9, xmm2 movq r9, xmm2
dec rdi dec rdi
mov edx, -1022 mov edx, -1022
@ -178,6 +178,6 @@ $sqrt_fixup_ryzen:
imul rcx, rax imul rcx, rax
sub rcx, r9 sub rcx, r9
adc rdi, 0 adc rdi, 0
jmp $sqrt_fixup_ryzen_ret jmp sqrt_fixup_${ALGO}_ryzen_ret
$cnv2_main_loop_ryzen_endp: cnv2_main_loop_${ALGO}_ryzen_endp:

View file

@ -47,7 +47,7 @@
mov rax, r8 mov rax, r8
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
and eax, 2097136 and eax, ${MASK}
movq xmm10, QWORD PTR [r10+96] movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx movq xmm0, rcx
mov rcx, QWORD PTR [r10+104] mov rcx, QWORD PTR [r10+104]
@ -57,14 +57,14 @@
mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+240], r9
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
movq xmm13, rcx movq xmm13, rcx
mov r12d, 524288 mov r12d, ${ITERATIONS}
#ifdef __APPLE__ #ifdef __APPLE__
ALIGN 16 ALIGN 16
#else #else
ALIGN 64 ALIGN 64
#endif #endif
cnv2_mainloop_soft_aes_sandybridge: cnv2_main_loop_${ALGO}_soft_aes_sandybridge:
movd xmm11, r12d movd xmm11, r12d
mov r12, QWORD PTR [r10+272] mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11] lea r13, QWORD PTR [rax+r11]
@ -152,7 +152,7 @@ cnv2_mainloop_soft_aes_sandybridge:
movdqu XMMWORD PTR [rdx+r11], xmm1 movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6 movq rdi, xmm6
mov r10, rdi mov r10, rdi
and r10d, 2097136 and r10d, ${MASK}
xor edx, edx xor edx, edx
mov rax, rcx mov rax, rcx
shl rax, 32 shl rax, 32
@ -185,9 +185,9 @@ cnv2_mainloop_soft_aes_sandybridge:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movq rdx, xmm1 movq rdx, xmm1
test rdx, 524287 test rdx, 524287
je sqrt_fixup_soft_aes_sandybridge je sqrt_fixup_${ALGO}_soft_aes_sandybridge
psrlq xmm1, 19 psrlq xmm1, 19
sqrt_fixup_soft_aes_sandybridge_ret: sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret:
mov r9, r10 mov r9, r10
movdqa xmm13, xmm1 movdqa xmm13, xmm1
@ -223,12 +223,12 @@ sqrt_fixup_soft_aes_sandybridge_ret:
xor r8, rbx xor r8, rbx
mov rax, r8 mov rax, r8
mov QWORD PTR [r14+8], r9 mov QWORD PTR [r14+8], r9
and eax, 2097136 and eax, ${MASK}
xor r9, rbp xor r9, rbp
mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax mov QWORD PTR [rsp+248], rax
sub r12d, 1 sub r12d, 1
jne cnv2_mainloop_soft_aes_sandybridge jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge
ldmxcsr DWORD PTR [rsp+4] ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16] movaps xmm6, XMMWORD PTR [rsp+16]
@ -249,9 +249,9 @@ sqrt_fixup_soft_aes_sandybridge_ret:
pop rsi pop rsi
pop rbp pop rbp
pop rbx pop rbx
jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp
sqrt_fixup_soft_aes_sandybridge: sqrt_fixup_${ALGO}_soft_aes_sandybridge:
dec rdx dec rdx
mov r15d, -1022 mov r15d, -1022
shl r15, 32 shl r15, 32
@ -266,6 +266,6 @@ sqrt_fixup_soft_aes_sandybridge:
sub rcx, r9 sub rcx, r9
adc rdx, 0 adc rdx, 0
movq xmm1, rdx movq xmm1, rdx
jmp sqrt_fixup_soft_aes_sandybridge_ret jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret
cnv2_mainloop_soft_aes_sandybridge_asm_endp: cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp:

View file

@ -1,70 +0,0 @@
; Main-loop body labelled cn_fast_mainloop_sandybridge.
;
; A bare instruction sequence with no procedure header and no ret, so it is
; presumably included inside a surrounding procedure.
;
; Input: rcx is the base pointer for every initial load (offsets 0..56, 192,
; 224, 256 and 264). NOTE(review): presumably a hash context whose pointer at
; +224 is the scratchpad; confirm against the caller.
;
; Prologue: store rbx/rbp/rsi/rdi in the four slots above the return address,
; then push r14 and r15.
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
mov rax, QWORD PTR [rcx+48]
; ebp = iteration counter (262144 = 0x40000).
mov ebp, 262144
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r8 = q[32]^q[0]; rdi (below) = q[40]^q[8].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
; xmm3 ends up as { q[48]^q[16], q[56]^q[24] } (low, high).
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
; rdx = first buffer offset: 2097136 = 0x1FFFF0 keeps a 16-byte aligned
; offset below 2 MiB.
mov rdx, r8
; r15 = pointer stored at [rcx+264]; indexed by a single byte in the loop.
mov r15, QWORD PTR [rcx+264]
and edx, 2097136
; r14 = (qword at byte offset 35 behind the pointer in [rcx+256]) ^ q[192];
; XOR-ed into the second stored qword on every iteration.
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
; rsi = pointer stored at [rcx+224]; base address of every buffer access.
mov rsi, QWORD PTR [rcx+224]
punpcklqdq xmm3, xmm0
; Preload the first 16-byte buffer line into xmm2.
movdqu xmm2, XMMWORD PTR [rdx+rsi]
ALIGN 64
cn_fast_mainloop_sandybridge:
; xmm1 = { r8, rdi }; one AES round of the current line keyed with it.
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
; r10 = low qword of the AES result; r9 = address of the second line.
movq r10, xmm2
mov r9d, r10d
and r9d, 2097136
add r9, rsi
; Store (AES result ^ xmm3) to the current line; the AES result then becomes
; the new xmm3.
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
; Byte tweak: take byte 11 of the value just stored, replace it with the byte
; at [r15 + that byte] and write it back to offset 11 of the same line.
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
; rbx / r11 = the two qwords of the second line before it is overwritten.
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
; 64x64 -> 128 multiply: rdx:rax = rbx * r10.
mov rax, rbx
mul r10
; r8 += product high; stored as the first qword of the second line.
add r8, rdx
mov QWORD PTR [r9], r8
; rdi += product low; stored XOR r14 as the second qword.
add rdi, rax
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
; Next-iteration state: r8 ^= old first qword, rdi ^= old second qword,
; rdx = next line offset, xmm2 = next line.
xor r8, rbx
mov rdx, r8
and edx, 2097136
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cn_fast_mainloop_sandybridge
; Epilogue: the save slots are 16 bytes further from rsp because r14/r15 are
; still pushed.
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -1,162 +0,0 @@
; Main-loop body labelled cn_fast_mainloop_soft_aes_sandybridge. It has the
; same overall shape as a hardware-AES loop but performs the AES round with
; table lookups instead of an aesenc instruction.
;
; A bare instruction sequence with no procedure header and no ret, so it is
; presumably included inside a surrounding procedure.
;
; Input: rcx is the base pointer for every initial load (offsets 0..56, 192,
; 224, 256, 264 and 272). NOTE(review): presumably a hash context whose
; pointer at +224 is the scratchpad; confirm against the caller.
;
; Prologue: push eight callee-saved GPRs, reserve 72 bytes and save
; xmm6-xmm9 in the first 64 of them. [rsp+64] holds the current line offset.
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r8 = q[32]^q[0]; r13 (below) = q[40]^q[8].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
; xmm4 ends up as { q[48]^q[16], q[56]^q[24] } (low, high).
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
; rdi = pointer stored at [rcx+224]; base address of every buffer access.
mov rdi, QWORD PTR [rcx+224]
; rdx = first buffer offset: 2097136 = 0x1FFFF0 keeps a 16-byte aligned
; offset below 2 MiB.
and edx, 2097136
; xmm5 = (qword at byte offset 35 behind the pointer in [rcx+256]) ^ q[192];
; XOR-ed into the second stored qword on every iteration.
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
; The GPRs are all needed for the table-lookup round, so loop-invariant values
; are parked in XMM registers: xmm8 = buffer base, xmm6 = rcx,
; xmm7 = pointer from [rcx+264], xmm9 = loop counter.
movq xmm8, rdi
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
; eax = iteration counter (262144 = 0x40000).
mov eax, 262144
ALIGN 64
cn_fast_mainloop_soft_aes_sandybridge:
movq xmm9, rax
; r12 = pointer stored at [rcx+272]: base of the dword lookup tables. The
; code below reads four 256-entry tables at r12+0, +1024, +2048 and +3072.
mov r12, QWORD PTR [rcx+272]
; Load the four dwords of the current line into esi, r10d, r14d, ebp.
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
; Table 0 lookups with the lowest byte of each dword; every register is
; shifted right by 8 after a byte has been consumed.
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
; Table 1 lookups (byte offset 1024) with the next byte, each taken from a
; different source dword than the table 0 lookup it is XOR-ed into.
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
; eax = top byte of the third dword, biased by 256 entries so that it indexes
; table 3 once r12 has been advanced to table 2.
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
; r12 now addresses table 2; an index of (byte + 256) reaches table 3.
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
; Restore the buffer base into rdi now that edi has been consumed.
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
; Pack the four result dwords into xmm3 as { eax, r9d, r10d, r11d } and XOR
; with the round key { r8, r13 }. rax is reloaded with the byte-table pointer
; from xmm7 in between.
movd xmm3, eax
movq rax, xmm7
punpckldq xmm3, xmm0
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
; r9 = low qword of the round result; r10 = offset of the second line.
movq r9, xmm3
mov r10d, r9d
and r10d, 2097136
; Store (round result ^ xmm4) to the current line.
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
; Byte tweak: take byte 11 of the value just stored, replace it with the byte
; at [rax + that byte] and write it back to offset 11 of the same line.
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
; rbx / r11 = the two qwords of the second line before it is overwritten;
; r9 becomes its address and rcx takes over the multiplier.
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
; The round result becomes the new xmm4.
movdqa xmm4, xmm3
; 64x64 -> 128 multiply: rdx:rax = rbx * rcx.
mul rcx
; Restore the base pointer into rcx for the next iteration's table load.
movq rcx, xmm6
; r8 += product high, r13 += product low; the second line receives
; { r8, r13 ^ xmm5 }.
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
; Reload the loop counter, compute the next line offset and keep it both in
; rdx and at [rsp+64].
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 2097136
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_fast_mainloop_soft_aes_sandybridge
; Epilogue: restore xmm6-xmm9, release the locals and pop the saved GPRs.
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,410 +0,0 @@
; Main-loop fragment that advances TWO hash states per iteration (the labels
; call it "double", "fast2", "sandybridge").
; Inputs as read by the code: rcx -> first context, rdx -> second context.
; From each context it reads the qwords at offsets 0..104 and a pointer at
; +224 that is used as the base address of the memory being mixed.
; The fragment has no PROC header and no ret; it ends by jumping to its own
; end label. NOTE(review): presumably a wrapper that includes this fragment
; supplies the procedure frame and the return - confirm against the includer.
;
; Stack: 8 pushes (64 bytes) + 184 bytes of locals put the entry rsp at
; rsp+248, so [rsp+256..279] addresses memory above the entry rsp.
; NOTE(review): that is presumably the caller-provided home/shadow area.
; rax keeps the entry rsp so the first XMM saves can be addressed from it.
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 184
; Save the current MXCSR at [rsp+272] and switch to the value 24448 (0x5F80).
; The original value is reloaded just before the register restores.
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
; r13 = memory base of the first state, rsi = memory base of the second.
; r10:r11 and rdi:rbp are the 128-bit running values of state 1 / state 2,
; each built as ctx[32..47] XOR ctx[0..15].
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
; r14d = iteration counter (262144 = 0x40000).
mov r14d, 262144
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
; Preserve xmm6..xmm15. The first three use the entry rsp held in rax
; (rax-88 == rsp+160, rax-104 == rsp+144, rax-120 == rsp+128).
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
; rdx = offset of state 1 into its memory: low bits of r10 masked with
; 2097136 (0x1FFFF0), i.e. a 16-byte aligned offset below 2 MiB.
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 2097136
mov rax, QWORD PTR [rcx+48]
; xmm13 = all zero; later copied into registers before cvtsi2sd / used as a
; clean destination.
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
; xmm14 = 1<<52 in both lanes; xmm12 = 1023<<52 (0x3FF0000000000000) in both
; lanes. Both are added to double bit patterns further down.
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
; xmm7 = ctx1[48..63] XOR ctx1[16..31]; xmm3 = ctx1[80..95] XOR ctx1[64..79].
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
punpcklqdq xmm3, xmm0
movq xmm0, rcx
; [rsp] keeps the state-1 memory base because r13 is reused inside the loop.
mov QWORD PTR [rsp], r13
; xmm6 / xmm8 are the second-context counterparts of xmm7 / xmm3.
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
punpcklqdq xmm6, xmm0
movq xmm0, rcx
; [rsp+256] / [rsp+264] hold state 1's running value across iterations.
mov QWORD PTR [rsp+256], r10
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
; rcx = masked offset of state 2.
and ecx, 2097136
punpcklqdq xmm8, xmm0
; xmm4 = (ctx1[96], ctx2[96]); xmm5 = (ctx1[104], ctx2[104]).
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
; r8 / r9 = current 16-byte line address of state 2 / state 1;
; xmm11 / xmm15 = the lines preloaded for the first iteration.
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN 64
main_loop_double_fast2_sandybridge:
; --- state 1: AES round on the current line with key (r10, r11) ---
movdqu xmm9, xmm15
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
; The three neighbouring lines (offset ^16, ^32, ^48) are read, each gets one
; of xmm7 / xmm2 / xmm3 added per 64-bit lane, and they are written back to
; rotated positions.
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
; r11 = low qword of the AES result; rdx = next masked offset for state 1.
movq r11, xmm9
mov edx, r11d
and edx, 2097136
; Write (AES result XOR xmm7) back to the line that was just read.
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
; --- state 2: same AES round and neighbour rotation, base rsi ---
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10
and ecx, 2097136
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
; --- state 1: 64x64 -> 128 multiply step ---
; r10 (qword loaded from the new line) is XORed with
; (low qword of xmm5 << 32) XOR (low qword of xmm4), then multiplied by r11.
xor edx, 16
mov r8d, edx
mov r15d, edx
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
mov rax, r10
mul r11
mov r11d, r8d
xor r11d, 48
; xmm0 = (hi, lo) of the product; rdx/rax are additionally XORed with the
; qwords at the ^32 neighbour (r11 = offset ^16 ^48).
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
xor r8d, 32
; Second neighbour rotation for state 1, around the new line.
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
; Add the (XORed) product halves to the running value kept on the stack,
; store the sum to the new line, then XOR with the line's previous contents
; (r10 / r9) to obtain the next running value.
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 2097136
mov QWORD PTR [rsp+264], rdx
; [rsp+8] = next masked offset of state 1; r15 = its address; xmm15 = the line
; preloaded for the next iteration. r13 is then repurposed as the address of
; state 2's new line (its original value is reloaded from [rsp] below).
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
lea r13, QWORD PTR [rsi+rcx]
; --- division step for both states ---
; r10 = (high qword of xmm5 << 32) XOR (high qword of xmm4) XOR r12, the
; state-2 counterpart of the value built for state 1 above.
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
; r11 / r8 = high qwords of the two AES results (the dividends).
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
; Dividends are converted as (x+1)>>1 so they fit a signed conversion.
lea rax, QWORD PTR [r11+1]
shr rax, 1
; xmm3 = (low qword of AES result 1, low qword of AES result 2).
; Divisors: (2*xmm5 + xmm3) per lane, low 32 bits ORed with 0x80000001
; (the 32-bit OR also clears the upper half of rdx / r9).
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
; Packed floating-point divide; adding 1<<52 to the raw bits bumps each
; result's exponent field by one before truncating to an integer quotient.
divpd xmm2, xmm1
paddq xmm2, xmm14
cvttsd2si rax, xmm2
psrldq xmm2, 8
; rbx = quotient 1; r11 = dividend - quotient*divisor (remainder 1).
; A negative remainder is corrected out of line.
mov rbx, rax
imul rax, rdx
sub r11, rax
js div_fix_1_fast2_sandybridge
div_fix_1_ret_fast2_sandybridge:
; Same for state 2: rdx = quotient 2, r8 = remainder 2.
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_fast2_sandybridge
div_fix_2_ret_fast2_sandybridge:
; xmm4 = dwords (q1, r1, q2, r2): one packed quotient/remainder qword per
; state. This replaces the value read from ctx[96] for the next iteration.
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
; --- square-root step for both states ---
; Input = ((AES low qword + packed div result) >> 12) + 1023<<52, interpreted
; as a double; result bits >> 19 become the new xmm5.
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
movdqa xmm5, xmm1
psrlq xmm5, 19
; If the low 19 bits of the raw result are all zero, the lane is recomputed
; with integer arithmetic out of line.
test r9, 524287
je sqrt_fix_1_fast2_sandybridge
sqrt_fix_1_ret_fast2_sandybridge:
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_fast2_sandybridge
sqrt_fix_2_ret_fast2_sandybridge:
; --- state 2: multiply step and second neighbour rotation ---
; r9 = low qword of AES result 2, r10 = value built above; rcx = masked
; offset of state 2's new line.
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
; Update state 2's running value (rdi:rbp) and store it to the new line
; (r13 currently holds that line's address).
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 2097136
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
; xmm11 = state 2's line preloaded for the next iteration.
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
; --- rotate history registers and reload state 1 for the next iteration ---
; xmm3 <- xmm7 <- xmm9 (state 1) and xmm8 <- xmm6 <- xmm10 (state 2);
; r13, rdx, r10, r11 are reloaded from their stack slots; r9 = r15.
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_fast2_sandybridge
; --- epilogue: restore MXCSR, xmm6..xmm15 and the pushed registers ---
; r11 = rsp+184, so [r11-24] == [rsp+160] etc. match the save offsets above.
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
; Skip the out-of-line fix-up code below.
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp
; Division fix-ups, reached when a remainder came out negative:
; decrement the quotient and add the divisor back to the remainder.
div_fix_1_fast2_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_fast2_sandybridge
div_fix_2_fast2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_fast2_sandybridge
; Square-root fix-up for lane 0 (state 1). r9 = raw sqrtpd bits, r8 = low
; qword of the sqrt input sum (xmm3). Recomputes the >>19 result with integer
; arithmetic: the final sub/adc adds 1 to r9 when the product is below r8.
; The high lane of xmm5 is preserved via xmm0.
sqrt_fix_1_fast2_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_fast2_sandybridge
; Square-root fix-up for lane 1 (state 2). r8 = raw sqrtpd bits of the high
; lane, r11 = high qword of xmm3; the corrected value replaces the high lane
; of xmm5.
sqrt_fix_2_fast2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_fast2_sandybridge
cnv2_double_mainloop_asm_fast2_sandybridge_endp:

View file

@ -1,180 +0,0 @@
; Single-state main-loop fragment (labels: "cnv2", "fast2", "bulldozer").
; Input as read by the code: rcx -> context; qwords at offsets 0..104 and a
; memory base pointer at +224 are read.
; The fragment has no PROC header and no ret; it ends by jumping to its end
; label. NOTE(review): presumably the including file supplies both - confirm.
;
; rbx, rbp and rsi are saved into [rsp+16..39] relative to the entry rsp
; (NOTE(review): presumably the caller's home/shadow area); the remaining
; registers are pushed. 5 pushes + 64 bytes put the entry rsp at rsp+104.
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
; Save MXCSR at [rsp] and switch to the value 24448 (0x5F80).
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; ebp = iteration counter (262144 = 0x40000).
mov ebp, 262144
; r8:r11 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = memory base.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; rdi = ctx[104], r15 = ctx[96] (loaded a few lines below).
mov rdi, QWORD PTR [r9+104]
; r10 = masked offset (2097136 = 0x1FFFF0).
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
; xmm8 = 0, copied into xmm1 before each sqrtsd.
xorps xmm8, xmm8
; Only ax is written; the following shl by 52 discards every bit of rax above
; bit 11, so the stale upper bits do not reach the result (1023<<52).
mov ax, 1023
shl rax, 52
movd xmm7, rax
mov r15, QWORD PTR [r9+96]
; xmm3 = ctx[48..63] XOR ctx[16..31]; xmm4 = ctx[80..95] XOR ctx[64..79].
punpcklqdq xmm3, xmm0
movd xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_fast2_bulldozer:
; Load the current line (aligned access) and build the AES key from r8:r11.
movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm6, r8
pinsrq xmm6, r11, 1
lea rdx, QWORD PTR [r10+rbx]
; r9 = 2*rdi (start of the divisor); rdi <<= 32 for the XOR into rsi below.
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
; Neighbour rotation: lines at offset ^16, ^32, ^48 get xmm3 / xmm6 / xmm4
; added per 64-bit lane and are written back to rotated positions.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
; rsi = r15 XOR (rdi << 32); the qword of the new line is XORed in below.
mov rsi, r15
xor rsi, rdi
; rdi is now free: it holds 1023<<52 until the sqrt result overwrites it.
mov edi, 1023
shl rdi, 52
; r14 = low qword, rax = high qword of the AES result.
movd r14, xmm5
pextrq rax, xmm5, 1
; Write (AES result XOR xmm3) back to the line that was read.
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
; Divisor = (2*old rdi + low dword of AES result) | 0x80000001, 32 bits wide
; (the 32-bit ops zero the upper half of r9). Dividend = rdx:rax with rdx = 0.
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
; r15 = (remainder << 32) + low 32 bits of the quotient.
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
; sqrt input = ((r14 + r15) >> 12) + 1023<<52, interpreted as a double.
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movd xmm0, rax
sqrtsd xmm1, xmm0
movd rdi, xmm1
; If the low 19 bits of the raw result are zero, take the integer fix-up
; path; otherwise rdi = result bits >> 19.
test rdi, 524287
je sqrt_fixup_fast2_bulldozer
shr rdi, 19
sqrt_fixup_fast2_bulldozer_ret:
; 64x64 -> 128 multiply of rsi by the AES low qword; xmm0 = (hi, lo).
mov rax, rsi
mul r14
movd xmm1, rax
movd xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
; Second neighbour rotation around the new line; the ^16 line is XORed with
; the product, and the product halves in rdx/rax are XORed with the ^32 line.
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
; History rotation: xmm4 <- xmm3 <- xmm5 (the latter at the loop end).
movdqa xmm4, xmm3
; Add the product to the running value, store it to the new line, then XOR
; with the line's previous contents (rsi / r13) for the next iteration.
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_fast2_bulldozer
; Epilogue: restore MXCSR, xmm6..xmm8 and the saved registers.
; r11 = rsp+64, so [r11+56..79] are the slots written at entry as
; [rsp+16..39], and [r11-48] == [rsp+16].
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
; Skip the out-of-line fix-up below.
jmp cnv2_main_loop_fast2_bulldozer_endp
; Square-root fix-up: r9 = AES low qword + r15 (the un-shifted sqrt input
; sum), rdi = raw sqrtsd bits. Recomputes rdi >> 19 with integer arithmetic;
; the final sub/adc adds 1 when the product is below r9.
sqrt_fixup_fast2_bulldozer:
movd r9, xmm5
add r9, r15
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_fast2_bulldozer_ret
cnv2_main_loop_fast2_bulldozer_endp:

View file

@ -1,182 +0,0 @@
; Single-state main-loop fragment (labels: "fast2", "ivybridge").
; Input as read by the code: rcx -> context; qwords at offsets 0..104 and a
; memory base pointer at +224 are read.
; The fragment has no PROC header and no ret; it ends by jumping to its end
; label. NOTE(review): presumably the including file supplies both - confirm.
;
; rbx is saved at [entry rsp + 24]; 7 pushes + 80 bytes put that slot at
; [rsp+160], which is where the epilogue reloads it from.
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 80
; Save MXCSR at [rsp] and switch to the value 24448 (0x5F80).
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; esi = iteration counter (262144); the 32-bit write zero-extends into rsi,
; which is what the loop decrements.
mov esi, 262144
mov r8, QWORD PTR [rcx+32]
; r13d = 0x80000001, ORed into the divisor each iteration.
mov r13d, -2147483647
; r8:r11 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = memory base.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; xmm3 = ctx[104]; r15 = ctx[96] (loaded below).
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
; r10 = masked offset (2097136 = 0x1FFFF0).
and r10d, 2097136
movq xmm5, rax
; Only ax is written; shl by 52 discards all bits of rax above bit 11, so
; xmm8 = 1023<<52.
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
; xmm4 = ctx[48..63] XOR ctx[16..31]; xmm5 = ctx[80..95] XOR ctx[64..79].
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
; xmm6 = line preloaded for the first iteration.
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 64
$main_loop_fast2_ivybridge:
lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
; AES round on the current line with key (r8, r11).
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
; rbp = low qword of the AES result; r9 = masked offset of the next line.
movq rbp, xmm6
mov r9, rbp
and r9d, 2097136
; Neighbour rotation: lines at ^16, ^32, ^48 get xmm4 / xmm7 / xmm5 added per
; 64-bit lane and are written back to rotated positions.
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
; r10 = new offset ^ 32, used after the sqrt step.
mov r10, r9
xor r10d, 32
; rdi = r15 XOR (low qword of xmm3 << 32), then XOR the new line's qword.
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
; Write (AES result XOR xmm4) back to the line that was read.
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
; Division: dividend = high qword of the AES result (rdx = 0),
; divisor = (2*ecx + ebp) | 0x80000001, 32 bits wide.
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
; r15 = (remainder << 32) + low 32 bits of the quotient.
mov eax, eax
shl rdx, 32
add rdx, rax
; r9 = r15 + AES low qword (sqrt input sum, also used by the fix-up).
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
movq rdx, xmm3
; If the low 19 bits of the raw result are zero, take the integer fix-up
; path; otherwise xmm3 = result bits >> 19.
test edx, 524287
je $sqrt_fixup_fast2_ivybridge
psrlq xmm3, 19
$sqrt_fixup_fast2_ivybridge_ret:
; 64x64 -> 128 multiply of rdi by the AES low qword. The product halves are
; XORed with the ^32 neighbour (rcx) before being added to r8 / r11.
mov ecx, r10d
mov rax, rdi
mul rbp
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
xor r8, rdi
; edi = masked offset for the next iteration.
mov edi, r8d
and edi, 2097136
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
punpcklqdq xmm2, xmm0
; Second neighbour rotation around the new line (r9 = ^16, rcx = ^32,
; r10 = ^48 relative to the new offset).
mov r9d, r10d
xor r9d, 48
xor r10d, 16
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
; History rotation: xmm5 <- xmm4 <- xmm6.
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
; Preload the next line and finish the running-value update.
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne $main_loop_fast2_ivybridge
; Epilogue: restore MXCSR, rbx, xmm6..xmm8 and the pushed registers.
ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
; Skip the out-of-line fix-up below.
jmp $cnv2_main_loop_fast2_ivybridge_endp
; Square-root fix-up: rdx = raw sqrtsd bits, r9 = sqrt input sum. Recomputes
; rdx >> 19 with integer arithmetic. r13 is borrowed as a temporary
; (-1022 << 32; "not r13 / sub rcx, r13" adds r13+1) and then reset to
; 0x80000001 for the main loop.
$sqrt_fixup_fast2_ivybridge:
dec rdx
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
not r13
sub rcx, r13
mov r13d, -2147483647
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp $sqrt_fixup_fast2_ivybridge_ret
$cnv2_main_loop_fast2_ivybridge_endp:

View file

@ -1,179 +0,0 @@
; Single-state main-loop fragment (labels: "fast2", "ryzen").
; Input as read by the code: rcx -> context; qwords at offsets 0..104 and a
; memory base pointer at +224 are read.
; The fragment has no PROC header and no ret; it ends by jumping to its end
; label. NOTE(review): presumably the including file supplies both - confirm.
;
; rbx, rbp and rsi are saved into [rsp+16..39] relative to the entry rsp;
; 5 pushes + 64 bytes put the entry rsp at rsp+104.
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
; Save MXCSR at [rsp] and switch to the value 24448 (0x5F80).
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; ebp = iteration counter (262144 = 0x40000).
mov ebp, 262144
; r8:r11 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = memory base.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; rdi = ctx[104]; r15 = ctx[96] (loaded below).
mov rdi, QWORD PTR [r9+104]
; r10 = masked offset (2097136 = 0x1FFFF0).
and r10d, 2097136
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
; xmm8 = 0, copied into xmm1 each iteration.
xorps xmm8, xmm8
; Only ax is written; shl by 52 discards all bits of rax above bit 11, so
; xmm7 = 1023<<52.
mov ax, 1023
shl rax, 52
movq xmm7, rax
mov r15, QWORD PTR [r9+96]
; xmm3 = ctx[48..63] XOR ctx[16..31]; xmm4 = ctx[80..95] XOR ctx[64..79].
punpcklqdq xmm3, xmm0
movq xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 64
$main_loop_fast2_ryzen:
; Load the current line (aligned access) and build the AES key from r8:r11.
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
lea rdx, QWORD PTR [r10+rbx]
; r9 = 2*rdi (start of the divisor); rdi <<= 32 for the XOR into rsi below.
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
; Neighbour rotation: lines at ^16, ^32, ^48 get xmm3 / xmm6 / xmm4 added per
; 64-bit lane and are written back to rotated positions.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
; rsi = r15 XOR (rdi << 32); the new line's qword is XORed in below.
mov rsi, r15
xor rsi, rdi
; r14 = low qword of the AES result.
movq r14, xmm5
; Write (AES result XOR xmm3) back to the line that was read.
movdqa xmm0, xmm5
pxor xmm0, xmm3
mov r10, r14
and r10d, 2097136
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
; Division: divisor = (2*old rdi + low dword of AES result) | 0x80000001,
; 32 bits wide; dividend = high qword of the AES result (rdx = 0).
add r9d, r14d
or r9d, -2147483647
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
; Interleave the low dwords of quotient and remainder:
; r15 = (remainder << 32) | low 32 bits of the quotient.
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
; xmm2 = r15 + AES result (low qword = un-shifted sqrt input sum, read by the
; fix-up path); sqrt input = (sum >> 12) + 1023<<52 interpreted as a double.
paddq xmm0, xmm5
movdqa xmm2, xmm0
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
; If the low 19 bits of the raw result are zero, take the integer fix-up
; path; otherwise rdi = result bits >> 19.
test rdi, 524287
je $sqrt_fixup_fast2_ryzen
shr rdi, 19
$sqrt_fixup_fast2_ryzen_ret:
; 64x64 -> 128 multiply of rsi by the AES low qword; xmm0 = (hi, lo).
mov rax, rsi
mul r14
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
; Second neighbour rotation around the new line; the ^16 line is XORed with
; the product, and the product halves in rdx/rax are XORed with the ^32 line.
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
; History rotation: xmm4 <- xmm3 <- xmm5 (the latter at the loop end).
movdqa xmm4, xmm3
; Add the product to the running value, store it to the new line, then XOR
; with the line's previous contents (rsi / r13) for the next iteration.
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 2097136
movdqa xmm3, xmm5
dec ebp
jne $main_loop_fast2_ryzen
; Epilogue: restore MXCSR, xmm6..xmm8 and the saved registers.
; r11 = rsp+64, so [r11+56..79] are the slots written at entry as
; [rsp+16..39], and [r11-48] == [rsp+16].
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
; Skip the out-of-line fix-up below.
jmp $cnv2_main_loop_fast2_ryzen_endp
; Square-root fix-up: r9 = un-shifted sqrt input sum (low qword of xmm2),
; rdi = raw sqrtsd bits. Recomputes rdi >> 19 with integer arithmetic; the
; final sub/adc adds 1 when the product is below r9.
$sqrt_fixup_fast2_ryzen:
movq r9, xmm2
dec rdi
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
sub rcx, r9
adc rdi, 0
jmp $sqrt_fixup_fast2_ryzen_ret
$cnv2_main_loop_fast2_ryzen_endp:

View file

@ -1,267 +0,0 @@
; Single-state main-loop fragment that performs the AES round with table
; lookups instead of the aesenc instruction (labels: "cnv2", "soft_aes",
; "fast2", "sandybridge").
; Input as read by the code: rcx -> context; qwords at offsets 0..104, a
; memory base pointer at +224 and a lookup-table pointer at +272 are read.
; The fragment has no PROC header and no ret; it ends by jumping to its end
; label. NOTE(review): presumably the including file supplies both - confirm.
;
; The context pointer is stored at [entry rsp + 8]; 8 pushes + 152 bytes put
; that slot at [rsp+224]. [rsp+240] and [rsp+248] (entry rsp + 24 / + 32) are
; used as spill slots for the high running qword and the masked offset.
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 152
; Save MXCSR at [rsp+4] and switch to the value 24448 (0x5F80).
stmxcsr DWORD PTR [rsp+4]
mov DWORD PTR [rsp], 24448
ldmxcsr DWORD PTR [rsp]
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
; r8:r9 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r11 = memory base (also kept in xmm12 because r11 is clobbered in the loop).
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
; Preserve xmm6..xmm13.
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
; Only ax is written; shl by 52 discards all bits of rax above bit 11, so
; xmm8 = 1023<<52.
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov rax, r8
; xmm4 = ctx[48..63] XOR ctx[16..31]; xmm5 = ctx[80..95] XOR ctx[64..79].
punpcklqdq xmm4, xmm0
; rax = masked offset (2097136 = 0x1FFFF0), spilled to [rsp+248].
and eax, 2097136
; xmm10 = ctx[96]; xmm13 = ctx[104]; xmm9 = 0.
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
mov QWORD PTR [rsp+248], rax
movq xmm12, r11
mov QWORD PTR [rsp+240], r9
punpcklqdq xmm5, xmm0
movq xmm13, rcx
; r12d = iteration counter (262144); parked in xmm11 during each iteration
; because r12 doubles as the table pointer.
mov r12d, 262144
ALIGN 64
cnv2_mainloop_soft_aes_fast2_sandybridge:
movd xmm11, r12d
; r12 = lookup-table base from ctx+272; r13 = address of the current line.
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
; Load the four dwords of the current line into esi, r10d, r14d, ebp.
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+248]
; Table round: each output dword is the XOR of four lookups, one byte from
; each input dword, taken from four consecutive 256-entry dword tables
; (r12+0, +1024, +2048, +3072). After "add r12, 2048" the third table is
; indexed directly and the fourth via "index + 256".
movzx ecx, sil
shr esi, 8
; xmm7 = round key = (r8, r9).
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
; r11 was used as a temporary above; reload the memory base from xmm12.
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
; rcx / rax / rdx = offsets of the ^16 / ^32 / ^48 neighbour lines.
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
movd xmm6, eax
mov rax, rdx
; xmm6 = the four output dwords assembled in order, then XORed with the key.
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
; Neighbour rotation: lines at ^16, ^32, ^48 get xmm4 / xmm7 / xmm5 added per
; 64-bit lane and are written back to rotated positions.
movdqu xmm2, XMMWORD PTR [rcx+r11]
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
; rdi = low qword of the round result; r10 = masked offset of the next line.
movq rdi, xmm6
mov r10, rdi
and r10d, 2097136
xor edx, edx
; rbx = xmm10 XOR (xmm13 << 32); the new line's qword is XORed in below.
mov rax, rcx
shl rax, 32
movq rbx, xmm10
xor rbx, rax
; Divisor = (2*rcx + edi) | 0x80000001. The 32-bit add zeroes the upper half
; of r9 and the 32-bit mov zero-extends rcx, so the 64-bit OR stays 32 bits
; wide.
lea r9, QWORD PTR [rcx+rcx]
add r9d, edi
; Write (round result XOR xmm4) back to the line that was read.
movdqa xmm0, xmm6
pxor xmm0, xmm4
mov ecx, -2147483647
movdqu XMMWORD PTR [r13], xmm0
or r9, rcx
; Dividend = high qword of the round result (rdx = 0).
movdqa xmm0, xmm6
movaps xmm1, xmm9
psrldq xmm0, 8
movq rax, xmm0
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
div r9
; xmm10 = (remainder << 32) + low 32 bits of the quotient.
shl rdx, 32
mov eax, eax
add rdx, rax
; r9 = xmm10 + round-result low qword (sqrt input sum, used by the fix-up).
lea r9, QWORD PTR [rdx+rdi]
movq xmm10, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm1, xmm0
movq rdx, xmm1
; If the low 19 bits of the raw result are zero, take the integer fix-up
; path; otherwise xmm1 = result bits >> 19.
test rdx, 524287
je sqrt_fixup_soft_aes_fast2_sandybridge
psrlq xmm1, 19
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
; xmm13 = new sqrt-derived value for the next iteration.
mov r9, r10
movdqa xmm13, xmm1
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
; 64x64 -> 128 multiply of rbx by the round-result low qword.
mov rax, rbx
mul rdi
; Second neighbour rotation around the new line; the ^16 line is XORed with
; the product, and the product halves in rdx/rax are XORed with the ^32 line.
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
; History rotation: xmm5 <- xmm4 <- xmm6.
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+240]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
; Reload the context pointer (saved at entry) and the loop counter.
mov r10, QWORD PTR [rsp+224]
movd r12d, xmm11
; Store the updated running value to the new line, then XOR with the line's
; previous contents (rbx / rbp) and spill the results for the next iteration.
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
and eax, 2097136
xor r9, rbp
mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax
sub r12d, 1
jne cnv2_mainloop_soft_aes_fast2_sandybridge
; Epilogue: restore MXCSR, xmm6..xmm13 and the pushed registers.
ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 152
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
; Skip the out-of-line fix-up below.
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp
; Square-root fix-up: rdx = raw sqrtsd bits, r9 = sqrt input sum. Recomputes
; rdx >> 19 with integer arithmetic (r15 is free here and used as scratch);
; the final sub/adc adds 1 when the product is below r9.
sqrt_fixup_soft_aes_fast2_sandybridge:
dec rdx
mov r15d, -1022
shl r15, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
lea rcx, [rcx+r15+1]
add rax, r15
imul rcx, rax
sub rcx, r9
adc rdx, 0
movq xmm1, rdx
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:

View file

@ -1,70 +0,0 @@
; Single-state main-loop fragment using the aesenc instruction
; (label: "cn_litev1_mainloop_sandybridge").
; Input as read by the code: rcx -> context. It reads qwords at offsets 0..56
; and 192, a memory base pointer at +224, a pointer at +256 (from which 8
; bytes at byte offset 35 are read) and a byte-table pointer at +264.
; The fragment has no PROC header and no ret. NOTE(review): presumably the
; including file supplies both - confirm.
;
; rbx, rbp, rsi, rdi are saved into [rsp+8..39] relative to the entry rsp;
; after the two pushes those slots are at [rsp+24..55].
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
mov rax, QWORD PTR [rcx+48]
; ebp = iteration counter (131072 = 0x20000).
mov ebp, 131072
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r8:rdi = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
; r15 = byte lookup table used for the byte-11 substitution below.
mov r15, QWORD PTR [rcx+264]
; rdx = masked offset (1048560 = 0xFFFF0, 16-byte aligned, below 1 MiB).
and edx, 1048560
; r14 = (qword at byte offset 35 of the buffer at ctx+256) XOR ctx[192];
; it is XORed into the high qword stored each iteration.
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
; rsi = memory base.
mov rsi, QWORD PTR [rcx+224]
; xmm3 = ctx[48..63] XOR ctx[16..31]; xmm2 = line preloaded for iteration 1.
punpcklqdq xmm3, xmm0
movdqu xmm2, XMMWORD PTR [rdx+rsi]
ALIGN 64
cn_litev1_mainloop_sandybridge:
; AES round on the current line with key (r8, rdi).
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
; r10 = low qword of the result; r9 = address of the next line.
movq r10, xmm2
mov r9d, r10d
and r9d, 1048560
add r9, rsi
; Store (result XOR previous result) to the line that was read; xmm3 then
; becomes the current result for the next iteration.
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
; Byte 11 of the value just stored is used as an index into the byte table at
; r15, and the looked-up byte overwrites byte 11 in memory.
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
; 64x64 -> 128 multiply of the new line's low qword by r10; the product is
; added to the running value (high half to r8, low half to rdi).
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
mul r10
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
; The high qword is stored XORed with r14; rdi itself keeps the plain sum.
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
; XOR the running value with the line's previous contents and derive the
; next masked offset; preload that line.
xor r8, rbx
mov rdx, r8
and edx, 1048560
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cn_litev1_mainloop_sandybridge
; Epilogue: reload the registers saved at entry (offsets shifted by the two
; pushes), then pop r15 / r14.
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -1,162 +0,0 @@
; Single-state main-loop fragment that performs the AES round with table
; lookups instead of aesenc (label: "cn_litev1_mainloop_soft_aes_sandybridge").
; Input as read by the code: rcx -> context. It reads qwords at offsets 0..56
; and 192, a memory base pointer at +224, a pointer at +256 (8 bytes at byte
; offset 35 are read from it), a byte-table pointer at +264 and a
; dword-table pointer at +272.
; The fragment has no PROC header and no ret. NOTE(review): presumably the
; including file supplies both - confirm.
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
; 72 bytes of locals: xmm6..xmm9 at [rsp+0..63], masked offset at [rsp+64].
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r8:r13 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
; rdi = memory base (also kept in xmm8 because rdi is clobbered in the loop).
mov rdi, QWORD PTR [rcx+224]
; rdx = masked offset (1048560 = 0xFFFF0).
and edx, 1048560
; xmm5 = (qword at byte offset 35 of the buffer at ctx+256) XOR ctx[192].
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
; xmm4 = ctx[48..63] XOR ctx[16..31].
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
; xmm6 = context pointer, xmm7 = byte-table pointer (ctx+264); both are
; reloaded into general registers inside the loop.
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
; eax = iteration counter (131072 = 0x20000); parked in xmm9 during each
; iteration.
mov eax, 131072
ALIGN 64
cn_litev1_mainloop_soft_aes_sandybridge:
movq xmm9, rax
; r12 = dword lookup-table base from ctx+272.
mov r12, QWORD PTR [rcx+272]
; Load the four dwords of the current line into esi, r10d, r14d, ebp.
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
; Table round: each output dword is the XOR of four lookups, one byte from
; each input dword, taken from four consecutive 256-entry dword tables
; (r12+0, +1024, +2048, +3072). After "add r12, 2048" the third table is
; indexed directly and the fourth via "index + 256".
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
; rdi was used as a temporary above; reload the memory base from xmm8.
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
movd xmm3, eax
; rax = byte-table pointer for the byte-11 substitution below.
movq rax, xmm7
punpckldq xmm3, xmm0
; xmm1 = round key = (r8, r13); xmm3 = assembled output dwords XOR key.
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
; r9 = low qword of the round result; r10 = masked offset of the next line.
movq r9, xmm3
mov r10d, r9d
and r10d, 1048560
; Store (result XOR previous result) to the line that was read.
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
; Byte 11 of the value just stored is used as an index into the byte table,
; and the looked-up byte overwrites byte 11 in memory.
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
; 64x64 -> 128 multiply of the new line's low qword (rbx) by the round
; result's low qword (rcx); r9 becomes the new line's address.
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
; xmm4 = current result, used as "previous result" next iteration.
movdqa xmm4, xmm3
mul rcx
; rcx = context pointer again (needed at the top of the loop).
movq rcx, xmm6
; Add the product to the running value (high half to r8, low half to r13);
; the high qword is stored XORed with xmm5, r13 keeps the plain sum.
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
; Reload the loop counter; derive and spill the next masked offset.
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 1048560
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_litev1_mainloop_soft_aes_sandybridge
; Epilogue: restore xmm6..xmm9 and the pushed registers.
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,162 +0,0 @@
; Single-state main-loop fragment that performs the AES round with table
; lookups instead of aesenc. This copy runs 262144 iterations with the
; 1048560 (0xFFFF0) offset mask.
; NOTE(review): the loop label below is "cn_litev1_mainloop_soft_aes_sandybridge",
; the same label used by the 131072-iteration fragment elsewhere in this
; change set; presumably this is a different variant that kept the label from
; the file it was copied from - confirm the label cannot clash where both
; fragments are included.
; Input as read by the code: rcx -> context. It reads qwords at offsets 0..56
; and 192, a memory base pointer at +224, a pointer at +256 (8 bytes at byte
; offset 35 are read from it), a byte-table pointer at +264 and a
; dword-table pointer at +272.
; The fragment has no PROC header and no ret. NOTE(review): presumably the
; including file supplies both - confirm.
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
; 72 bytes of locals: xmm6..xmm9 at [rsp+0..63], masked offset at [rsp+64].
sub rsp, 72
movaps XMMWORD PTR [rsp], xmm6
movaps XMMWORD PTR [rsp+16], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
movaps XMMWORD PTR [rsp+48], xmm9
mov rax, QWORD PTR [rcx+48]
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r8:r13 = running 128-bit value = ctx[32..47] XOR ctx[0..15].
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm4, rax
mov rax, QWORD PTR [rcx+256]
mov r13, QWORD PTR [rcx+40]
movq xmm0, rdx
xor r13, QWORD PTR [rcx+8]
mov rdx, r8
; rdi = memory base (also kept in xmm8 because rdi is clobbered in the loop).
mov rdi, QWORD PTR [rcx+224]
; rdx = masked offset (1048560 = 0xFFFF0).
and edx, 1048560
; xmm5 = (qword at byte offset 35 of the buffer at ctx+256) XOR ctx[192].
mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192]
movq xmm5, rax
movq xmm8, rdi
; xmm4 = ctx[48..63] XOR ctx[16..31].
punpcklqdq xmm4, xmm0
mov QWORD PTR [rsp+64], rdx
; xmm6 = context pointer, xmm7 = byte-table pointer (ctx+264); both are
; reloaded into general registers inside the loop.
movq xmm6, rcx
mov rax, QWORD PTR [rcx+264]
movq xmm7, rax
; eax = iteration counter (262144 = 0x40000); parked in xmm9 during each
; iteration.
mov eax, 262144
ALIGN 64
cn_litev1_mainloop_soft_aes_sandybridge:
movq xmm9, rax
; r12 = dword lookup-table base from ctx+272.
mov r12, QWORD PTR [rcx+272]
; Load the four dwords of the current line into esi, r10d, r14d, ebp.
mov esi, DWORD PTR [rdx+rdi]
mov r10d, DWORD PTR [rdx+rdi+4]
mov ebp, DWORD PTR [rdx+rdi+12]
mov r14d, DWORD PTR [rdx+rdi+8]
mov rdx, QWORD PTR [rsp+64]
; Table round: each output dword is the XOR of four lookups, one byte from
; each input dword, taken from four consecutive 256-entry dword tables
; (r12+0, +1024, +2048, +3072). After "add r12, 2048" the third table is
; indexed directly and the fourth via "index + 256".
movzx ecx, sil
shr esi, 8
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
add ebp, 256
movd xmm1, r11d
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
; rdi was used as a temporary above; reload the memory base from xmm8.
movq rdi, xmm8
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
punpckldq xmm2, xmm1
movq xmm1, r8
xor eax, DWORD PTR [r12+rcx*4]
xor eax, r15d
movd xmm3, eax
; rax = byte-table pointer for the byte-11 substitution below.
movq rax, xmm7
punpckldq xmm3, xmm0
; xmm1 = round key = (r8, r13); xmm3 = assembled output dwords XOR key.
movq xmm0, r13
punpcklqdq xmm1, xmm0
punpckldq xmm3, xmm2
pxor xmm3, xmm1
; r9 = low qword of the round result; r10 = masked offset of the next line.
movq r9, xmm3
mov r10d, r9d
and r10d, 1048560
; Store (result XOR previous result) to the line that was read.
movdqa xmm0, xmm3
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0
; Byte 11 of the value just stored is used as an index into the byte table,
; and the looked-up byte overwrites byte 11 in memory.
psrldq xmm0, 11
movq rcx, xmm0
movzx ecx, cl
mov cl, BYTE PTR [rcx+rax]
mov BYTE PTR [rdi+rdx+11], cl
; 64x64 -> 128 multiply of the new line's low qword (rbx) by the round
; result's low qword (rcx); r9 becomes the new line's address.
mov rbx, QWORD PTR [r10+rdi]
mov rcx, r9
lea r9, QWORD PTR [r10+rdi]
mov r11, QWORD PTR [r9+8]
mov rax, rbx
; xmm4 = current result, used as "previous result" next iteration.
movdqa xmm4, xmm3
mul rcx
; rcx = context pointer again (needed at the top of the loop).
movq rcx, xmm6
; Add the product to the running value (high half to r8, low half to r13);
; the high qword is stored XORed with xmm5, r13 keeps the plain sum.
add r8, rdx
add r13, rax
movq rax, xmm5
xor rax, r13
mov QWORD PTR [r9], r8
xor r8, rbx
mov QWORD PTR [r9+8], rax
; Reload the loop counter; derive and spill the next masked offset.
movq rax, xmm9
mov rdx, r8
xor r13, r11
and edx, 1048560
mov QWORD PTR [rsp+64], rdx
sub eax, 1
jne cn_litev1_mainloop_soft_aes_sandybridge
; Epilogue: restore xmm6..xmm9 and the pushed registers.
movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16]
movaps xmm8, XMMWORD PTR [rsp+32]
movaps xmm9, XMMWORD PTR [rsp+48]
add rsp, 72
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx

View file

@ -1,166 +1,171 @@
_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE _TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE
PUBLIC cnv1_mainloop_sandybridge_asm
PUBLIC cn_litev1_mainloop_sandybridge_asm
PUBLIC cn_fast_mainloop_sandybridge_asm
PUBLIC cnv2_mainloop_ivybridge_asm
PUBLIC cnv2_mainloop_ryzen_asm
PUBLIC cnv2_mainloop_bulldozer_asm
PUBLIC cnv2_double_mainloop_sandybridge_asm
PUBLIC cn_fastv2_mainloop_ivybridge_asm
PUBLIC cn_fastv2_mainloop_ryzen_asm
PUBLIC cn_fastv2_mainloop_bulldozer_asm
PUBLIC cn_fastv2_double_mainloop_sandybridge_asm
PUBLIC cn_liteupx_mainloop_sandybridge_asm
PUBLIC cn_ultralitev2_mainloop_ivybridge_asm
PUBLIC cn_ultralitev2_mainloop_ryzen_asm
PUBLIC cn_ultralitev2_mainloop_bulldozer_asm
PUBLIC cn_ultralitev2_double_mainloop_sandybridge_asm
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_sandybridge_asm
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_lite_sandybridge_asm
PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_fast_sandybridge_asm
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm PUBLIC cnv1_main_loop_upx_sandybridge_asm
PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm
PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm PUBLIC cnv2_main_loop_ivybridge_asm
PUBLIC cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PUBLIC cnv2_main_loop_ryzen_asm
PUBLIC cnv2_main_loop_bulldozer_asm
PUBLIC cnv2_double_main_loop_sandybridge_asm
PUBLIC cnv2_main_loop_fastv2_ivybridge_asm
PUBLIC cnv2_main_loop_fastv2_ryzen_asm
PUBLIC cnv2_main_loop_fastv2_bulldozer_asm
PUBLIC cnv2_double_main_loop_fastv2_sandybridge_asm
PUBLIC cnv2_main_loop_ultralite_ivybridge_asm
PUBLIC cnv2_main_loop_ultralite_ryzen_asm
PUBLIC cnv2_main_loop_ultralite_bulldozer_asm
PUBLIC cnv2_double_main_loop_ultralite_sandybridge_asm
PUBLIC cnv1_main_loop_soft_aes_sandybridge_asm
PUBLIC cnv1_main_loop_lite_soft_aes_sandybridge_asm
PUBLIC cnv1_main_loop_fast_soft_aes_sandybridge_asm
PUBLIC cnv1_main_loop_upx_soft_aes_sandybridge_asm
PUBLIC cnv2_main_loop_soft_aes_sandybridge_asm
PUBLIC cnv2_main_loop_fastv2_soft_aes_sandybridge_asm
PUBLIC cnv2_main_loop_ultralite_soft_aes_sandybridge_asm
ALIGN 64 ALIGN 64
cnv1_mainloop_sandybridge_asm PROC cnv1_main_loop_sandybridge_asm PROC
INCLUDE cnv1_mainloop_sandybridge.inc INCLUDE cnv1_main_loop_sandybridge.inc
ret 0 ret 0
cnv1_mainloop_sandybridge_asm ENDP cnv1_main_loop_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_litev1_mainloop_sandybridge_asm PROC cnv1_main_loop_lite_sandybridge_asm PROC
INCLUDE cn_litev1_mainloop_sandybridge.inc INCLUDE cnv1_main_loop_lite_sandybridge.inc
ret 0 ret 0
cn_litev1_mainloop_sandybridge_asm ENDP cnv1_main_loop_lite_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_fast_mainloop_sandybridge_asm PROC cnv1_main_loop_fast_sandybridge_asm PROC
INCLUDE cn_fast_mainloop_sandybridge.inc INCLUDE cnv1_main_loop_fast_sandybridge.inc
ret 0 ret 0
cn_fast_mainloop_sandybridge_asm ENDP cnv1_main_loop_fast_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_ivybridge_asm PROC cnv1_main_loop_upx_sandybridge_asm PROC
INCLUDE cnv1_main_loop_upx_sandybridge.inc
ret 0
cnv1_main_loop_upx_sandybridge_asm ENDP
ALIGN 64
cnv2_main_loop_ivybridge_asm PROC
INCLUDE cnv2_main_loop_ivybridge.inc INCLUDE cnv2_main_loop_ivybridge.inc
ret 0 ret 0
cnv2_mainloop_ivybridge_asm ENDP cnv2_main_loop_ivybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_ryzen_asm PROC cnv2_main_loop_ryzen_asm PROC
INCLUDE cnv2_main_loop_ryzen.inc INCLUDE cnv2_main_loop_ryzen.inc
ret 0 ret 0
cnv2_mainloop_ryzen_asm ENDP cnv2_main_loop_ryzen_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_bulldozer_asm PROC cnv2_main_loop_bulldozer_asm PROC
INCLUDE cnv2_main_loop_bulldozer.inc INCLUDE cnv2_main_loop_bulldozer.inc
ret 0 ret 0
cnv2_mainloop_bulldozer_asm ENDP cnv2_main_loop_bulldozer_asm ENDP
ALIGN 64 ALIGN 64
cnv2_double_mainloop_sandybridge_asm PROC cnv2_double_main_loop_sandybridge_asm PROC
INCLUDE cnv2_double_main_loop_sandybridge.inc INCLUDE cnv2_double_main_loop_sandybridge.inc
ret 0 ret 0
cnv2_double_mainloop_sandybridge_asm ENDP cnv2_double_main_loop_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_fastv2_mainloop_ivybridge_asm PROC cnv2_main_loop_fastv2_ivybridge_asm PROC
INCLUDE cn_fastv2_main_loop_ivybridge.inc INCLUDE cnv2_main_loop_fastv2_ivybridge.inc
ret 0 ret 0
cn_fastv2_mainloop_ivybridge_asm ENDP cnv2_main_loop_fastv2_ivybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_fastv2_mainloop_ryzen_asm PROC cnv2_main_loop_fastv2_ryzen_asm PROC
INCLUDE cn_fastv2_main_loop_ryzen.inc INCLUDE cnv2_main_loop_fastv2_ryzen.inc
ret 0 ret 0
cn_fastv2_mainloop_ryzen_asm ENDP cnv2_main_loop_fastv2_ryzen_asm ENDP
ALIGN 64 ALIGN 64
cn_fastv2_mainloop_bulldozer_asm PROC cnv2_main_loop_fastv2_bulldozer_asm PROC
INCLUDE cn_fastv2_main_loop_bulldozer.inc INCLUDE cnv2_main_loop_fastv2_bulldozer.inc
ret 0 ret 0
cn_fastv2_mainloop_bulldozer_asm ENDP cnv2_main_loop_fastv2_bulldozer_asm ENDP
ALIGN 64 ALIGN 64
cn_fastv2_double_mainloop_sandybridge_asm PROC cnv2_double_main_loop_fastv2_sandybridge_asm PROC
INCLUDE cn_fastv2_double_main_loop_sandybridge.inc INCLUDE cnv2_double_main_loop_fastv2_sandybridge.inc
ret 0 ret 0
cn_fastv2_double_mainloop_sandybridge_asm ENDP cnv2_double_main_loop_fastv2_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_liteupx_mainloop_sandybridge_asm PROC cnv2_main_loop_ultralite_ivybridge_asm PROC
INCLUDE cn_liteupx_mainloop_sandybridge.inc INCLUDE cnv2_main_loop_ultralite_ivybridge.inc
ret 0 ret 0
cn_liteupx_mainloop_sandybridge_asm ENDP cnv2_main_loop_ultralite_ivybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_ultralitev2_mainloop_ivybridge_asm PROC cnv2_main_loop_ultralite_ryzen_asm PROC
INCLUDE cn_ultralitev2_main_loop_ivybridge.inc INCLUDE cnv2_main_loop_ultralite_ryzen.inc
ret 0 ret 0
cn_ultralitev2_mainloop_ivybridge_asm ENDP cnv2_main_loop_ultralite_ryzen_asm ENDP
ALIGN 64 ALIGN 64
cn_ultralitev2_mainloop_ryzen_asm PROC cnv2_main_loop_ultralite_bulldozer_asm PROC
INCLUDE cn_ultralitev2_main_loop_ryzen.inc INCLUDE cnv2_main_loop_ultralite_bulldozer.inc
ret 0 ret 0
cn_ultralitev2_mainloop_ryzen_asm ENDP cnv2_main_loop_ultralite_bulldozer_asm ENDP
ALIGN 64 ALIGN 64
cn_ultralitev2_mainloop_bulldozer_asm PROC cnv2_double_main_loop_ultralite_sandybridge_asm PROC
INCLUDE cn_ultralitev2_main_loop_bulldozer.inc INCLUDE cnv2_double_main_loop_ultralite_sandybridge.inc
ret 0 ret 0
cn_ultralitev2_mainloop_bulldozer_asm ENDP cnv2_double_main_loop_ultralite_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_ultralitev2_double_mainloop_sandybridge_asm PROC cnv1_main_loop_soft_aes_sandybridge_asm PROC
INCLUDE cn_ultralitev2_double_main_loop_sandybridge.inc INCLUDE cnv1_main_loop_soft_aes_sandybridge.inc
ret 0 ret 0
cn_ultralitev2_double_mainloop_sandybridge_asm ENDP cnv1_main_loop_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv1_mainloop_soft_aes_sandybridge_asm PROC cnv1_main_loop_lite_soft_aes_sandybridge_asm PROC
INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc INCLUDE cnv1_main_loop_lite_soft_aes_sandybridge.inc
ret 0 ret 0
cnv1_mainloop_soft_aes_sandybridge_asm ENDP cnv1_main_loop_lite_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_litev1_mainloop_soft_aes_sandybridge_asm PROC cnv1_main_loop_fast_soft_aes_sandybridge_asm PROC
INCLUDE cn_litev1_mainloop_soft_aes_sandybridge.inc INCLUDE cnv1_main_loop_fast_soft_aes_sandybridge.inc
ret 0 ret 0
cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP cnv1_main_loop_fast_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_fast_mainloop_soft_aes_sandybridge_asm PROC cnv1_main_loop_upx_soft_aes_sandybridge_asm PROC
INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc INCLUDE cnv1_main_loop_upx_soft_aes_sandybridge.inc
ret 0 ret 0
cn_fast_mainloop_soft_aes_sandybridge_asm ENDP cnv1_main_loop_upx_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cnv2_mainloop_soft_aes_sandybridge_asm PROC cnv2_main_loop_soft_aes_sandybridge_asm PROC
INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc INCLUDE cnv2_main_loop_soft_aes_sandybridge.inc
ret 0 ret 0
cnv2_mainloop_soft_aes_sandybridge_asm ENDP cnv2_main_loop_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_fastv2_mainloop_soft_aes_sandybridge_asm PROC cnv2_main_loop_fastv2_soft_aes_sandybridge_asm PROC
INCLUDE cn_fastv2_mainloop_soft_aes_sandybridge.inc INCLUDE cnv2_main_loop_fastv2_soft_aes_sandybridge.inc
ret 0 ret 0
cn_fastv2_mainloop_soft_aes_sandybridge_asm ENDP cnv2_main_loop_fastv2_soft_aes_sandybridge_asm ENDP
ALIGN 64 ALIGN 64
cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC cnv2_main_loop_ultralite_soft_aes_sandybridge_asm PROC
INCLUDE cn_liteupx_mainloop_soft_aes_sandybridge.inc INCLUDE cnv2_main_loop_ultralite_soft_aes_sandybridge.inc
ret 0 ret 0
cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP cnv2_main_loop_ultralite_soft_aes_sandybridge_asm ENDP
ALIGN 64
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PROC
INCLUDE cn_ultralitev2_mainloop_soft_aes_sandybridge.inc
ret 0
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ENDP
_TEXT_CN_MAINLOOP ENDS _TEXT_CN_MAINLOOP ENDS
END END

View file

@ -3,142 +3,146 @@
# define FN_PREFIX(fn) fn # define FN_PREFIX(fn) fn
.section .text .section .text
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_sandybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm)
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm) .global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) .global FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm)
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm)
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ivybridge_asm)
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_ryzen_asm)
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_bulldozer_asm)
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm)
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm)
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm) .global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm)
.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm)
.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm)
.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm)
.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm)
ALIGN 64 ALIGN 64
FN_PREFIX(cnv1_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_sandybridge_asm):
#include "../cnv1_mainloop_sandybridge.inc" #include "../cnv1_main_loop_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_litev1_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm):
#include "../cn_litev1_mainloop_sandybridge.inc" #include "../cnv1_main_loop_lite_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fast_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm):
#include "../cn_fast_mainloop_sandybridge.inc" #include "../cnv1_main_loop_fast_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_ivybridge_asm): FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm):
#include "../cnv1_main_loop_upx_sandybridge.inc"
ret 0
ALIGN 64
FN_PREFIX(cnv2_main_loop_ivybridge_asm):
#include "../cnv2_main_loop_ivybridge.inc" #include "../cnv2_main_loop_ivybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_ryzen_asm): FN_PREFIX(cnv2_main_loop_ryzen_asm):
#include "../cnv2_main_loop_ryzen.inc" #include "../cnv2_main_loop_ryzen.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_bulldozer_asm): FN_PREFIX(cnv2_main_loop_bulldozer_asm):
#include "../cnv2_main_loop_bulldozer.inc" #include "../cnv2_main_loop_bulldozer.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_main_loop_sandybridge_asm):
#include "../cnv2_double_main_loop_sandybridge.inc" #include "../cnv2_double_main_loop_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm): FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm):
#include "../cn_fastv2_main_loop_ivybridge.inc" #include "../cnv2_main_loop_fastv2_ivybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fastv2_mainloop_ryzen_asm): FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm):
#include "../cn_fastv2_main_loop_ryzen.inc" #include "../cnv2_main_loop_fastv2_ryzen.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm): FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm):
#include "../cn_fastv2_main_loop_bulldozer.inc" #include "../cnv2_main_loop_fastv2_bulldozer.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm):
#include "../cn_fastv2_double_main_loop_sandybridge.inc" #include "../cnv2_double_main_loop_fastv2_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm):
#include "../cn_liteupx_mainloop_sandybridge.inc" #include "../cnv2_main_loop_ultralite_ivybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm):
#include "../cn_ultralitev2_main_loop_ivybridge.inc" #include "../cnv2_main_loop_ultralite_ryzen.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm): FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm):
#include "../cn_ultralitev2_main_loop_ryzen.inc" #include "../cnv2_main_loop_ultralite_bulldozer.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm): FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm):
#include "../cn_ultralitev2_main_loop_bulldozer.inc" #include "../cnv2_double_main_loop_ultralite_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm): FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm):
#include "../cn_ultralitev2_double_main_loop_sandybridge.inc" #include "../cnv1_main_loop_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm):
#include "../cnv1_mainloop_soft_aes_sandybridge.inc" #include "../cnv1_main_loop_lite_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm):
#include "../cn_litev1_mainloop_soft_aes_sandybridge.inc" #include "../cnv1_main_loop_fast_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm):
#include "../cn_fast_mainloop_soft_aes_sandybridge.inc" #include "../cnv1_main_loop_upx_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm):
#include "../cnv2_mainloop_soft_aes_sandybridge.inc" #include "../cnv2_main_loop_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm):
#include "../cn_fastv2_mainloop_soft_aes_sandybridge.inc" #include "../cnv2_main_loop_fastv2_soft_aes_sandybridge.inc"
ret 0 ret 0
ALIGN 64 ALIGN 64
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm): FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm):
#include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc" #include "../cnv2_main_loop_ultralite_soft_aes_sandybridge.inc"
ret 0
ALIGN 64
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
#include "../cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
ret 0 ret 0

View file

@ -1,410 +0,0 @@
; Main-loop body of the double-hash "ultralitev2" variant (Sandy Bridge tuning).
; There is no ret in this text: execution leaves through the *_endp label at the
; very bottom, so the including wrapper is expected to provide the return.
;
; Inputs as used below:
;   rcx, rdx - pointers to two contexts. For each one the code reads 64-bit words
;              at offsets 0..104 and a pointer at offset 224 that serves as the
;              base of the memory buffer indexed inside the loop
;              (presumably the per-hash scratchpad - TODO confirm against the caller).
; The loop runs 65536 times (r14d). Buffer offsets are masked with 131056 (0x1FFF0),
; i.e. 16-byte aligned offsets below 128 KiB.
; rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm15 and MXCSR are saved on entry and restored
; on exit.

; Keep the entry rsp in rax: xmm6-xmm8 are saved relative to it further down.
mov rax, rsp
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
; 8 pushes (64 bytes) + 184 bytes of locals: rsp is now entry rsp - 248.
sub rsp, 184
; Save the incoming MXCSR at entry rsp + 24 and load 24448 (0x5F80) for the loop.
; NOTE(review): 0x5F80 looks like "all exceptions masked, rounding control = 10b";
; confirm against the MXCSR layout in the CPU manual.
stmxcsr DWORD PTR [rsp+272]
mov DWORD PTR [rsp+276], 24448
ldmxcsr DWORD PTR [rsp+276]
; r13 = buffer base of the first context, r9/r8 = copies of the context pointers
; (rdx and rcx themselves are reused as scratch below).
mov r13, QWORD PTR [rcx+224]
mov r9, rdx
; r10:r11 = ctx0 words (+32 ^ +0) and (+40 ^ +8); rdi:rbp = the same for ctx1.
mov r10, QWORD PTR [rcx+32]
mov r8, rcx
xor r10, QWORD PTR [rcx]
; r14d = iteration counter, decremented at the bottom of the loop.
mov r14d, 65536
mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8]
; rsi = buffer base of the second context.
mov rsi, QWORD PTR [rdx+224]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov rdi, QWORD PTR [r9+32]
xor rdi, QWORD PTR [r9]
mov rbp, QWORD PTR [r9+40]
xor rbp, QWORD PTR [r9+8]
movq xmm0, rdx
; Save xmm6-xmm15 in the local area (rsp+160 down to rsp+16).
movaps XMMWORD PTR [rax-88], xmm6
movaps XMMWORD PTR [rax-104], xmm7
movaps XMMWORD PTR [rax-120], xmm8
movaps XMMWORD PTR [rsp+112], xmm9
movaps XMMWORD PTR [rsp+96], xmm10
movaps XMMWORD PTR [rsp+80], xmm11
movaps XMMWORD PTR [rsp+64], xmm12
movaps XMMWORD PTR [rsp+48], xmm13
movaps XMMWORD PTR [rsp+32], xmm14
movaps XMMWORD PTR [rsp+16], xmm15
; edx = first buffer offset for ctx0 (r10 masked to 0x1FFF0).
mov rdx, r10
movq xmm4, QWORD PTR [r8+96]
and edx, 131056
mov rax, QWORD PTR [rcx+48]
; xmm13 stays zero; it is copied into registers before each cvtsi2sd below.
xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r8+72]
movq xmm5, QWORD PTR [r8+104]
movq xmm7, rax
; xmm14 = 1 << 52 in both 64-bit lanes.
mov eax, 1
shl rax, 52
movq xmm14, rax
punpcklqdq xmm14, xmm14
; xmm12 = 1023 << 52 in both 64-bit lanes.
mov eax, 1023
shl rax, 52
movq xmm12, rax
punpcklqdq xmm12, xmm12
mov rax, QWORD PTR [r8+80]
xor rax, QWORD PTR [r8+64]
; xmm7 = ctx0 (+48 ^ +16) : (+56 ^ +24), low : high.
punpcklqdq xmm7, xmm0
movq xmm0, rcx
mov rcx, QWORD PTR [r9+56]
xor rcx, QWORD PTR [r9+24]
movq xmm3, rax
mov rax, QWORD PTR [r9+48]
xor rax, QWORD PTR [r9+16]
; xmm3 = ctx0 (+80 ^ +64) : (+88 ^ +72).
punpcklqdq xmm3, xmm0
movq xmm0, rcx
; [rsp] keeps the ctx0 buffer base because r13 is repurposed inside the loop.
mov QWORD PTR [rsp], r13
mov rcx, QWORD PTR [r9+88]
xor rcx, QWORD PTR [r9+72]
movq xmm6, rax
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
; xmm6 = ctx1 (+48 ^ +16) : (+56 ^ +24).
punpcklqdq xmm6, xmm0
movq xmm0, rcx
; [rsp+256]/[rsp+264] (entry rsp + 8 / + 16) hold the ctx0 r10:r11 pair between iterations.
mov QWORD PTR [rsp+256], r10
; ecx = first buffer offset for ctx1.
mov rcx, rdi
mov QWORD PTR [rsp+264], r11
movq xmm8, rax
and ecx, 131056
; xmm8 = ctx1 (+80 ^ +64) : (+88 ^ +72).
punpcklqdq xmm8, xmm0
; xmm4 = ctx0[+96] : ctx1[+96], xmm5 = ctx0[+104] : ctx1[+104].
movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0
movq xmm0, QWORD PTR [r9+104]
; r8 -> current 16-byte line of the ctx1 buffer, xmm11 = its contents.
lea r8, QWORD PTR [rcx+rsi]
movdqu xmm11, XMMWORD PTR [r8]
punpcklqdq xmm5, xmm0
; r9 -> current 16-byte line of the ctx0 buffer, xmm15 = its contents.
lea r9, QWORD PTR [rdx+r13]
movdqu xmm15, XMMWORD PTR [r9]
ALIGN 64
main_loop_double_ultralitev2_sandybridge:
; ---- ctx0: one AES round on the current line, key = r11:r10 ----
movdqu xmm9, xmm15
; eax / ebx / edx = current offset ^ 16 / ^ 32 / ^ 48 (the three neighbouring lines).
mov eax, edx
mov ebx, edx
xor eax, 16
xor ebx, 32
xor edx, 48
movq xmm0, r11
movq xmm2, r10
punpcklqdq xmm2, xmm0
aesenc xmm9, xmm2
; Rotate the neighbouring lines while adding to each:
; (^16 + xmm7) -> ^32, (^32 + xmm2) -> ^48, (^48 + xmm3) -> ^16.
movdqu xmm0, XMMWORD PTR [rax+r13]
movdqu xmm1, XMMWORD PTR [rbx+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [rbx+r13], xmm0
movdqu xmm0, XMMWORD PTR [rdx+r13]
movdqu XMMWORD PTR [rdx+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [rax+r13], xmm0
; r11 = low qword of the AES result; edx = next ctx0 offset derived from it.
movq r11, xmm9
mov edx, r11d
and edx, 131056
; Write (AES result ^ xmm7) back to the line that was just read.
movdqa xmm0, xmm9
pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0
; rbx -> second ctx0 line of this iteration, r10 = its low qword.
lea rbx, QWORD PTR [rdx+r13]
mov r10, QWORD PTR [rdx+r13]
; ---- ctx1: same AES step, key = rbp:rdi (xmm11 now holds the key, not the line) ----
movdqu xmm10, xmm11
movq xmm0, rbp
movq xmm11, rdi
punpcklqdq xmm11, xmm0
aesenc xmm10, xmm11
mov eax, ecx
mov r12d, ecx
xor eax, 16
xor r12d, 32
xor ecx, 48
; Same three-line rotation for ctx1, using xmm6 / xmm11 / xmm8 as addends.
movdqu xmm0, XMMWORD PTR [rax+rsi]
paddq xmm0, xmm6
movdqu xmm1, XMMWORD PTR [r12+rsi]
movdqu XMMWORD PTR [r12+rsi], xmm0
paddq xmm1, xmm11
movdqu xmm0, XMMWORD PTR [rcx+rsi]
movdqu XMMWORD PTR [rcx+rsi], xmm1
paddq xmm0, xmm8
movdqu XMMWORD PTR [rax+rsi], xmm0
; ecx = next ctx1 offset from the low qword of its AES result.
movq rcx, xmm10
and ecx, 131056
movdqa xmm0, xmm10
pxor xmm0, xmm6
movdqu XMMWORD PTR [r8], xmm0
; r12 = low qword of the second ctx1 line, r9 = high qword of the second ctx0 line.
mov r12, QWORD PTR [rcx+rsi]
mov r9, QWORD PTR [rbx+8]
; ---- ctx0: 64x64 multiply and second three-line rotation ----
; r8d = r15d = offset ^ 16.
xor edx, 16
mov r8d, edx
mov r15d, edx
; r10 ^= (low qword of xmm5 << 32) ^ low qword of xmm4.
movq rdx, xmm5
shl rdx, 32
movq rax, xmm4
xor rdx, rax
xor r10, rdx
; rdx:rax = r10 * r11 (unsigned 128-bit product).
mov rax, r10
mul r11
; r11d = offset ^ 32.
mov r11d, r8d
xor r11d, 48
; xmm0 = product as (high : low); rdx/rax are additionally XORed with the ^32 line.
movq xmm0, rdx
xor rdx, [r11+r13]
movq xmm1, rax
xor rax, [r11+r13+8]
punpcklqdq xmm0, xmm1
pxor xmm0, XMMWORD PTR [r8+r13]
; r8d = offset ^ 48.
xor r8d, 32
movdqu xmm1, XMMWORD PTR [r11+r13]
paddq xmm0, xmm7
paddq xmm1, xmm2
movdqu XMMWORD PTR [r11+r13], xmm0
movdqu xmm0, XMMWORD PTR [r8+r13]
movdqu XMMWORD PTR [r8+r13], xmm1
paddq xmm0, xmm3
movdqu XMMWORD PTR [r15+r13], xmm0
; Add the (XORed) product to the saved r10:r11 pair, store the sums to the line at rbx,
; then XOR with the values read from that line to form the pair for the next iteration.
mov r11, QWORD PTR [rsp+256]
add r11, rdx
mov rdx, QWORD PTR [rsp+264]
add rdx, rax
mov QWORD PTR [rbx], r11
xor r11, r10
mov QWORD PTR [rbx+8], rdx
xor rdx, r9
mov QWORD PTR [rsp+256], r11
and r11d, 131056
mov QWORD PTR [rsp+264], rdx
; [rsp+8] = next ctx0 offset; r15 -> that line, xmm15 = its contents (prefetched for next pass).
mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13]
movdqu xmm15, XMMWORD PTR [r11+r13]
; r13 is borrowed to point at the second ctx1 line; it is reloaded from [rsp] below.
lea r13, QWORD PTR [rsi+rcx]
; ---- integer division for both hashes, done with packed doubles ----
; r10 = (high qword of xmm5 << 32) ^ high qword of xmm4 ^ r12 (ctx1 multiplier input).
movdqa xmm0, xmm5
psrldq xmm0, 8
movaps xmm2, xmm13
movq r10, xmm0
psllq xmm5, 1
shl r10, 32
; r11 / r8 = high qwords of the ctx0 / ctx1 AES results (the dividends).
movdqa xmm0, xmm9
psrldq xmm0, 8
movdqa xmm1, xmm10
movq r11, xmm0
psrldq xmm1, 8
movq r8, xmm1
psrldq xmm4, 8
movaps xmm0, xmm13
movq rax, xmm4
xor r10, rax
movaps xmm1, xmm13
xor r10, r12
; rax = (dividend0 + 1) >> 1.
lea rax, QWORD PTR [r11+1]
shr rax, 1
; xmm3 = low qwords of both AES results; divisors come from (xmm5 << 1) + xmm3.
movdqa xmm3, xmm9
punpcklqdq xmm3, xmm10
paddq xmm5, xmm3
movq rdx, xmm5
psrldq xmm5, 8
cvtsi2sd xmm2, rax
; Divisors are 32-bit with bit 31 and bit 0 forced on (0x80000001), zero-extended.
or edx, -2147483647
lea rax, QWORD PTR [r8+1]
shr rax, 1
movq r9, xmm5
cvtsi2sd xmm0, rax
or r9d, -2147483647
cvtsi2sd xmm1, rdx
unpcklpd xmm2, xmm0
movaps xmm0, xmm13
cvtsi2sd xmm0, r9
unpcklpd xmm1, xmm0
divpd xmm2, xmm1
; Adding 1 << 52 to the raw bits bumps each exponent by one (quotient * 2),
; which appears to compensate for the halved dividends above.
paddq xmm2, xmm14
; rbx = quotient0 estimate, r11 = dividend0 - quotient0 * divisor0 (remainder).
cvttsd2si rax, xmm2
psrldq xmm2, 8
mov rbx, rax
imul rax, rdx
sub r11, rax
; A negative remainder means the estimate was one too high; fix it out of line.
js div_fix_1_ultralitev2_sandybridge
div_fix_1_ret_ultralitev2_sandybridge:
; Same for ctx1: rdx = quotient1 estimate, r8 = remainder1.
cvttsd2si rdx, xmm2
mov rax, rdx
imul rax, r9
movd xmm2, r11d
movd xmm4, ebx
sub r8, rax
js div_fix_2_ultralitev2_sandybridge
div_fix_2_ret_ultralitev2_sandybridge:
; xmm4 = (quotient0 | remainder0 << 32) : (quotient1 | remainder1 << 32).
movd xmm1, r8d
movd xmm0, edx
punpckldq xmm2, xmm1
punpckldq xmm4, xmm0
punpckldq xmm4, xmm2
; ---- square root for both hashes ----
; Input bits = ((xmm3 + xmm4) >> 12) + (1023 << 52), evaluated as doubles.
paddq xmm3, xmm4
movdqa xmm0, xmm3
psrlq xmm0, 12
paddq xmm0, xmm12
sqrtpd xmm1, xmm0
movq r9, xmm1
; xmm5 = raw sqrt bits >> 19 in both lanes (next iteration's xmm5).
movdqa xmm5, xmm1
psrlq xmm5, 19
; If the low 19 bits of the raw result are all zero, take the correction path.
test r9, 524287
je sqrt_fix_1_ultralitev2_sandybridge
sqrt_fix_1_ret_ultralitev2_sandybridge:
; r9 = low qword of the ctx1 AES result (multiplier for the ctx1 product below).
movq r9, xmm10
psrldq xmm1, 8
movq r8, xmm1
test r8, 524287
je sqrt_fix_2_ultralitev2_sandybridge
sqrt_fix_2_ret_ultralitev2_sandybridge:
; ---- ctx1: 64x64 multiply and second three-line rotation ----
; r12d / r8d / ecx = ctx1 offset ^ 16 / ^ 32 / ^ 48.
mov r12d, ecx
mov r8d, ecx
xor r12d, 16
xor r8d, 32
xor ecx, 48
; rdx:rax = r10 * r9.
mov rax, r10
mul r9
movq xmm0, rax
movq xmm3, rdx
punpcklqdq xmm3, xmm0
movdqu xmm0, XMMWORD PTR [r12+rsi]
pxor xmm0, xmm3
movdqu xmm1, XMMWORD PTR [r8+rsi]
xor rdx, [r8+rsi]
xor rax, [r8+rsi+8]
movdqu xmm3, XMMWORD PTR [rcx+rsi]
paddq xmm0, xmm6
paddq xmm1, xmm11
paddq xmm3, xmm8
movdqu XMMWORD PTR [r8+rsi], xmm0
movdqu XMMWORD PTR [rcx+rsi], xmm1
movdqu XMMWORD PTR [r12+rsi], xmm3
; Update the ctx1 rdi:rbp pair through the line at r13 and derive the next ctx1 offset.
add rdi, rdx
mov QWORD PTR [r13], rdi
xor rdi, r10
mov ecx, edi
and ecx, 131056
lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8]
add rbp, rax
mov QWORD PTR [r13+8], rbp
movdqu xmm11, XMMWORD PTR [rcx+rsi]
xor rbp, rdx
; ---- rotate state for the next iteration ----
; Restore r13 (ctx0 buffer base), edx (ctx0 offset) and the ctx0 r10:r11 pair;
; the previous "current" vectors become the "previous" ones:
; xmm3 <- xmm7 <- xmm9 (ctx0) and xmm8 <- xmm6 <- xmm10 (ctx1).
mov r13, QWORD PTR [rsp]
movdqa xmm3, xmm7
mov rdx, QWORD PTR [rsp+8]
movdqa xmm8, xmm6
mov r10, QWORD PTR [rsp+256]
movdqa xmm7, xmm9
mov r11, QWORD PTR [rsp+264]
movdqa xmm6, xmm10
mov r9, r15
dec r14d
jne main_loop_double_ultralitev2_sandybridge
; ---- epilogue: restore MXCSR, xmm6-xmm15 and the pushed registers ----
ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48]
; r11 = rsp + 184, i.e. the address of the first pushed register (r15).
lea r11, QWORD PTR [rsp+184]
movaps xmm6, XMMWORD PTR [r11-24]
movaps xmm7, XMMWORD PTR [r11-40]
movaps xmm8, XMMWORD PTR [r11-56]
movaps xmm9, XMMWORD PTR [r11-72]
movaps xmm10, XMMWORD PTR [r11-88]
movaps xmm11, XMMWORD PTR [r11-104]
movaps xmm12, XMMWORD PTR [r11-120]
movaps xmm14, XMMWORD PTR [rsp+32]
movaps xmm15, XMMWORD PTR [rsp+16]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
; Skip the out-of-line fix-up code below.
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
; Division fix-up (ctx0): quotient - 1, remainder + divisor.
div_fix_1_ultralitev2_sandybridge:
dec rbx
add r11, rdx
jmp div_fix_1_ret_ultralitev2_sandybridge
; Division fix-up (ctx1): quotient - 1, remainder + divisor.
div_fix_2_ultralitev2_sandybridge:
dec rdx
add r8, r9
jmp div_fix_2_ret_ultralitev2_sandybridge
; Square-root fix-up for the low lane: recompute the >> 19 result in integer
; arithmetic from (raw bits - 1), compare against the sqrt input sum (low qword of
; xmm3) and add the resulting carry; the high lane of xmm5 is preserved.
sqrt_fix_1_ultralitev2_sandybridge:
movq r8, xmm3
movdqa xmm0, xmm5
psrldq xmm0, 8
dec r9
; r11 = -1022 << 32 (mov to r11d zero-extends, so this is 0xFFFFFC0200000000).
mov r11d, -1022
shl r11, 32
mov rax, r9
shr r9, 19
shr rax, 20
mov rdx, r9
sub rdx, rax
lea rdx, [rdx+r11+1]
add rax, r11
imul rdx, rax
; CF from this subtract is folded into r9 by the adc.
sub rdx, r8
adc r9, 0
movq xmm5, r9
punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
; Square-root fix-up for the high lane: same computation using the high qword of
; xmm3; the corrected value replaces the high lane of xmm5.
sqrt_fix_2_ultralitev2_sandybridge:
psrldq xmm3, 8
movq r11, xmm3
dec r8
mov ebx, -1022
shl rbx, 32
mov rax, r8
shr r8, 19
shr rax, 20
mov rdx, r8
sub rdx, rax
lea rdx, [rdx+rbx+1]
add rax, rbx
imul rdx, rax
sub rdx, r11
adc r8, 0
movq xmm0, r8
punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
; Common exit point; control falls out of the included text here.
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:

View file

@ -1,180 +0,0 @@
; Main-loop body of the single-hash "ultralitev2" variant (Bulldozer tuning).
; There is no ret in this text: execution leaves through the *_endp label at the
; bottom, so the including wrapper is expected to provide the return.
;
; Input as used below:
;   rcx - pointer to a context. The code reads 64-bit words at offsets 0..104 and a
;         pointer at offset 224 that serves as the base of the memory buffer indexed
;         in the loop (presumably the scratchpad - TODO confirm against the caller).
; The loop runs 65536 times (ebp); offsets are masked with 131056 (0x1FFF0).
; Buffer lines are accessed with movdqa, so the buffer base must be 16-byte aligned.
; rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm8 and MXCSR are saved and restored.

; Save rbx/rbp/rsi in the slots at entry rsp + 16 / + 24 / + 32 instead of pushing them.
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
; Save the incoming MXCSR at [rsp] and load 24448 (0x5F80) for the loop.
; NOTE(review): 0x5F80 looks like "all exceptions masked, rounding control = 10b";
; confirm against the MXCSR layout in the CPU manual.
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
; r9 = copy of the context pointer (rcx is overwritten below).
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; ebp = iteration counter.
mov ebp, 65536
; r8:r11 = ctx words (+32 ^ +0) and (+40 ^ +8).
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movd xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = buffer base.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movd xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; rdi = ctx[+104], r15 (below) = ctx[+96].
mov rdi, QWORD PTR [r9+104]
; r10d = first buffer offset.
and r10d, 131056
movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
; xmm8 stays zero; it is copied into xmm1 before each sqrtsd.
xorps xmm8, xmm8
; rax = 1023 << 52: only the low 12 bits of rax survive the shift, and those are
; fully defined by the 16-bit write of 1023.
mov ax, 1023
shl rax, 52
movd xmm7, rax
mov r15, QWORD PTR [r9+96]
; xmm3 = (+48 ^ +16) : (+56 ^ +24), xmm4 = (+80 ^ +64) : (+88 ^ +72), low : high.
punpcklqdq xmm3, xmm0
movd xmm0, rcx
punpcklqdq xmm4, xmm0
ALIGN 16
cnv2_main_loop_ultralitev2_bulldozer:
; ---- AES round on the current line, key = r11:r8 ----
movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm6, r8
pinsrq xmm6, r11, 1
; rdx -> current line (for the write-back below).
lea rdx, QWORD PTR [r10+rbx]
; r9 = rdi * 2 (divisor input), rdi <<= 32 (XORed into the multiplier input below).
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
; ecx / eax / r10d = current offset ^ 16 / ^ 32 / ^ 48.
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
; Rotate the neighbouring lines while adding to each:
; (^16 + xmm3) -> ^32, (^32 + xmm6) -> ^48, (^48 + xmm4) -> ^16.
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
; rsi = r15 ^ (old rdi << 32); XORed with the next line's low qword further down.
mov rsi, r15
xor rsi, rdi
; rdi = 1023 << 52, added to the sqrt input below.
mov edi, 1023
shl rdi, 52
; r14 = low qword of the AES result, rax = its high qword (the dividend).
movd r14, xmm5
pextrq rax, xmm5, 1
; Write (AES result ^ xmm3) back to the line that was read.
movdqa xmm0, xmm5
pxor xmm0, xmm3
; r10d = offset of the second line of this iteration.
mov r10, r14
and r10d, 131056
movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx]
; r12 -> second line, r13 = its high qword.
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
; ---- integer division: divisor = (r9d + r14d) | 0x80000001, zero-extended ----
add r9d, r14d
or r9d, -2147483647
xor edx, edx
div r9
; r15 = (remainder << 32) + low 32 bits of the quotient.
mov eax, eax
shl rdx, 32
lea r15, [rax+rdx]
; ---- square root: input bits = ((r14 + r15) >> 12) + (1023 << 52) ----
lea rax, [r14+r15]
shr rax, 12
add rax, rdi
movd xmm0, rax
sqrtsd xmm1, xmm0
movd rdi, xmm1
; If the low 19 bits of the raw result are all zero, take the correction path;
; otherwise the result is simply the raw bits >> 19.
test rdi, 524287
je sqrt_fixup_ultralitev2_bulldozer
shr rdi, 19
sqrt_fixup_ultralitev2_bulldozer_ret:
; ---- 64x64 multiply: rdx:rax = rsi * r14 ----
mov rax, rsi
mul r14
; xmm0 = product as (high : low).
movd xmm1, rax
movd xmm0, rdx
punpcklqdq xmm0, xmm1
; r9d / ecx / r10d = second-line offset ^ 16 / ^ 32 / ^ 48.
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
; The product is XORed with the ^32 line (in rdx/rax) and into the ^16 line (xmm2),
; then the three lines are rotated with the same addends as above.
movdqa xmm1, XMMWORD PTR [rcx+rbx]
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
; xmm4 <- xmm3 (and xmm3 <- xmm5 below): shift the "previous result" history.
movdqa xmm4, xmm3
; Add the product to r8:r11, store the sums to the second line, then XOR with the
; values read from that line to form the pair and offset for the next iteration.
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
xor r8, rsi
mov QWORD PTR [r12+8], r11
mov r10, r8
xor r11, r13
and r10d, 131056
movdqa xmm3, xmm5
dec ebp
jne cnv2_main_loop_ultralitev2_bulldozer
; ---- epilogue: restore MXCSR, xmm6-xmm8 and the saved registers ----
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
; r11 = rsp + 64 = address of the first pushed register; entry rsp = r11 + 40.
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
; Skip the out-of-line fix-up code below.
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
; Square-root fix-up: recompute the >> 19 result in integer arithmetic from
; (raw bits - 1), compare against the sqrt input sum (r9 = low qword of xmm5 + r15)
; and add the resulting carry to rdi.
sqrt_fixup_ultralitev2_bulldozer:
movd r9, xmm5
add r9, r15
dec rdi
; rdx = -1022 << 32 (mov to edx zero-extends, so this is 0xFFFFFC0200000000).
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
; CF from this subtract is folded into rdi by the adc.
sub rcx, r9
adc rdi, 0
jmp sqrt_fixup_ultralitev2_bulldozer_ret
; Common exit point; control falls out of the included text here.
cnv2_main_loop_ultralitev2_bulldozer_endp:

View file

@ -1,182 +0,0 @@
; Main-loop body of the single-hash "ultralitev2" variant (Ivy Bridge tuning).
; There is no ret in this text: execution leaves through the *_endp label at the
; bottom, so the including wrapper is expected to provide the return.
;
; Input as used below:
;   rcx - pointer to a context. The code reads 64-bit words at offsets 0..104 and a
;         pointer at offset 224 that serves as the base of the memory buffer indexed
;         in the loop (presumably the scratchpad - TODO confirm against the caller).
; The loop runs 65536 times (rsi); offsets are masked with 131056 (0x1FFF0).
; rbx, rbp, rsi, rdi, r12-r15, xmm6-xmm8 and MXCSR are saved and restored.

; Save rbx in the slot at entry rsp + 24 instead of pushing it.
mov QWORD PTR [rsp+24], rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
; 7 pushes (56 bytes) + 80 bytes of locals: rsp is now entry rsp - 136.
sub rsp, 80
; Save the incoming MXCSR at [rsp] and load 24448 (0x5F80) for the loop.
; NOTE(review): 0x5F80 looks like "all exceptions masked, rounding control = 10b";
; confirm against the MXCSR layout in the CPU manual.
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
mov rax, QWORD PTR [rcx+48]
; r9 = copy of the context pointer (rcx is overwritten below).
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; esi = iteration counter (the write to esi zero-extends into rsi).
mov esi, 65536
; r8:r11 = ctx words (+32 ^ +0) and (+40 ^ +8).
mov r8, QWORD PTR [rcx+32]
; r13d = 0x80000001, ORed into every divisor inside the loop.
mov r13d, -2147483647
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm4, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = buffer base.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; xmm3 = ctx[+104] (low qword), r15 (below) = ctx[+96].
movq xmm3, QWORD PTR [r9+104]
movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8
; r10d = first buffer offset.
and r10d, 131056
movq xmm5, rax
; xmm8 = 1023 << 52: only the low 12 bits of rax survive the shift, and those are
; fully defined by the 16-bit write of 1023.
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov r15, QWORD PTR [r9+96]
; xmm4 = (+48 ^ +16) : (+56 ^ +24), xmm5 = (+80 ^ +64) : (+88 ^ +72), low : high.
punpcklqdq xmm4, xmm0
movq xmm0, rcx
punpcklqdq xmm5, xmm0
; xmm6 = first line; later lines are loaded at the bottom of the loop.
movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 64
$main_loop_ultralitev2_ivybridge:
; rdx -> current line (for the write-back below).
lea rdx, QWORD PTR [r10+rbx]
; ecx / eax / r10d = current offset ^ 16 / ^ 32 / ^ 48.
mov ecx, r10d
mov eax, r10d
mov rdi, r15
xor ecx, 16
xor eax, 32
xor r10d, 48
; ---- AES round on the current line, key = r11:r8 ----
movq xmm0, r11
movq xmm7, r8
punpcklqdq xmm7, xmm0
aesenc xmm6, xmm7
; rbp = low qword of the AES result; r9d = offset of the second line.
movq rbp, xmm6
mov r9, rbp
and r9d, 131056
; Rotate the neighbouring lines while adding to each:
; (^16 + xmm4) -> ^32, (^32 + xmm7) -> ^48, (^48 + xmm5) -> ^16.
movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm1, xmm7
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [rcx+rbx], xmm0
movdqu XMMWORD PTR [rax+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
; r10d = second-line offset ^ 32.
mov r10, r9
xor r10d, 32
; rdi = r15 ^ (low qword of xmm3 << 32), then ^ low qword of the second line.
movq rcx, xmm3
mov rax, rcx
shl rax, 32
xor rdi, rax
; Write (AES result ^ xmm4) back to the line that was read.
movdqa xmm0, xmm6
pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx], xmm0
xor rdi, QWORD PTR [r9+rbx]
; r14 -> second line, r12 = its high qword.
lea r14, QWORD PTR [r9+rbx]
mov r12, QWORD PTR [r14+8]
; ---- integer division ----
; divisor = (2 * ecx + ebp) | 0x80000001, zero-extended; dividend = high qword of
; the AES result with rdx = 0.
xor edx, edx
lea r9d, DWORD PTR [ecx+ecx]
add r9d, ebp
movdqa xmm0, xmm6
psrldq xmm0, 8
or r9d, r13d
movq rax, xmm0
div r9
xorps xmm3, xmm3
; r15 = rdx = (remainder << 32) + low 32 bits of the quotient.
mov eax, eax
shl rdx, 32
add rdx, rax
; ---- square root: input bits = ((rdx + rbp) >> 12) + (1023 << 52) ----
; r9 keeps the un-shifted sum for the fix-up path.
lea r9, QWORD PTR [rdx+rbp]
mov r15, rdx
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm3, xmm0
movq rdx, xmm3
; If the low 19 bits of the raw result are all zero, take the correction path;
; otherwise the result is simply the raw bits >> 19.
test edx, 524287
je $sqrt_fixup_ultralitev2_ivybridge
psrlq xmm3, 19
$sqrt_fixup_ultralitev2_ivybridge_ret:
; ---- 64x64 multiply: rdx:rax = rdi * rbp ----
; ecx = second-line offset ^ 32.
mov ecx, r10d
mov rax, rdi
mul rbp
; High half: keep a copy in xmm2, XOR with the ^32 line, add into r8 and store.
movq xmm2, rdx
xor rdx, [rcx+rbx]
add r8, rdx
mov QWORD PTR [r14], r8
; r8 ^= rdi gives the next key half; edi = next buffer offset.
xor r8, rdi
mov edi, r8d
and edi, 131056
; Low half: same treatment, accumulated into r11.
movq xmm0, rax
xor rax, [rcx+rbx+8]
add r11, rax
mov QWORD PTR [r14+8], r11
; xmm2 = product as (high : low).
punpcklqdq xmm2, xmm0
; r9d = second-line offset ^ 16, r10d = second-line offset ^ 48
; (r10d entered as offset ^ 32, so ^ 48 yields ^ 16 and ^ 16 yields ^ 48).
mov r9d, r10d
xor r9d, 48
xor r10d, 16
; Rotate the three lines again; the ^16 line is additionally XORed with the product.
pxor xmm2, XMMWORD PTR [r9+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx]
paddq xmm0, xmm5
movdqu xmm1, XMMWORD PTR [rcx+rbx]
paddq xmm2, xmm4
paddq xmm1, xmm7
; xmm5 <- xmm4 <- xmm6: shift the "previous result" history.
movdqa xmm5, xmm4
movdqu XMMWORD PTR [r9+rbx], xmm0
movdqa xmm4, xmm6
movdqu XMMWORD PTR [rcx+rbx], xmm2
movdqu XMMWORD PTR [r10+rbx], xmm1
; Load the line for the next iteration and finish the key update.
movdqu xmm6, [rdi+rbx]
mov r10d, edi
xor r11, r12
dec rsi
jne $main_loop_ultralitev2_ivybridge
; ---- epilogue: restore MXCSR, rbx, xmm6-xmm8 and the pushed registers ----
ldmxcsr DWORD PTR [rsp]
; rsp + 160 = entry rsp + 24, the slot rbx was saved to on entry.
mov rbx, QWORD PTR [rsp+160]
movaps xmm6, XMMWORD PTR [rsp+64]
movaps xmm7, XMMWORD PTR [rsp+48]
movaps xmm8, XMMWORD PTR [rsp+32]
add rsp, 80
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
; Skip the out-of-line fix-up code below.
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
; Square-root fix-up: recompute the >> 19 result in integer arithmetic from
; (raw bits - 1), compare against the sqrt input sum in r9 and add the resulting
; carry. r13 is borrowed as scratch and reloaded with 0x80000001 before use ends.
$sqrt_fixup_ultralitev2_ivybridge:
dec rdx
; r13 = -1022 << 32 (mov to r13d zero-extends, so this is 0xFFFFFC0200000000).
mov r13d, -1022
shl r13, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
add rax, r13
; not + sub is equivalent to rcx += r13 + 1.
not r13
sub rcx, r13
; Restore the loop-invariant divisor mask.
mov r13d, -2147483647
imul rcx, rax
; CF from this subtract is folded into rdx by the adc.
sub rcx, r9
adc rdx, 0
movq xmm3, rdx
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
; Common exit point; control falls out of the included text here.
$cnv2_main_loop_ultralitev2_ivybridge_endp:

View file

@ -1,179 +0,0 @@
; Main-loop fragment for the "cnv2 / ultralitev2 / ryzen" variant (names taken
; from the labels below). This is an include fragment: it contains no `ret`,
; and normal exit is a jump to the *_endp label on the last line, so the
; including file must supply the surrounding procedure and the return.
;
; Inputs (as read by the code):
;   rcx        -> state block; qwords at +0..+104 are read as seeds and the
;                 qword at +224 is used as the base address of the buffer the
;                 loop walks (presumably the scratchpad - confirm with caller).
; Loop shape:
;   65536 iterations; every buffer offset is masked with 131056 (0x1FFF0),
;   i.e. a 16-byte-aligned offset below 128 KiB.
; Register roles inside the loop:
;   rbx = buffer base          r10 = current masked offset
;   r8:r11 = 128-bit value used as the AES round key and updated each pass
;   xmm3 / xmm4 = results kept from the previous one / two iterations
;   r15 = packed division result, rdi = square-root result, ebp = counter
;   xmm7 = 1023 << 52 (double exponent bias), xmm8 = zero
; NOTE(review): comments use MASM ';' syntax - confirm this fragment is not
; also assembled by GAS, where ';' separates statements.

; --- prologue -------------------------------------------------------------
; rbx/rbp/rsi are parked in the stack slots above the return address
; (presumably the Win64 home space - confirm) instead of being pushed.
mov QWORD PTR [rsp+16], rbx
mov QWORD PTR [rsp+24], rbp
mov QWORD PTR [rsp+32], rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 64
; Save the caller's MXCSR at [rsp] and install 24448 (0x5F80): the default
; 0x1F80 (all exceptions masked) plus bit 14 of the rounding-control field.
stmxcsr DWORD PTR [rsp]
mov DWORD PTR [rsp+4], 24448
ldmxcsr DWORD PTR [rsp+4]
; Seed the working registers by XOR-ing pairs of state qwords.
; r9 keeps the state pointer because rcx is reused below.
mov rax, QWORD PTR [rcx+48]
mov r9, rcx
xor rax, QWORD PTR [rcx+16]
; ebp = iteration counter.
mov ebp, 65536
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40]
mov r10, r8
mov rdx, QWORD PTR [rcx+56]
movq xmm3, rax
xor rdx, QWORD PTR [rcx+24]
xor r11, QWORD PTR [rcx+8]
; rbx = base address of the buffer walked by the loop.
mov rbx, QWORD PTR [rcx+224]
mov rax, QWORD PTR [r9+80]
xor rax, QWORD PTR [r9+64]
movq xmm0, rdx
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72]
; rdi = initial square-root value, read from state +104.
mov rdi, QWORD PTR [r9+104]
; r10 = first buffer offset (low bits of r8, 16-byte aligned).
and r10d, 131056
; xmm6-xmm8 are saved in the local frame and restored in the epilogue.
movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+16], xmm8
xorps xmm8, xmm8
; Only ax is written; the stale upper bits of rax do not matter because the
; shift by 52 discards everything above bit 11. Result: xmm7 = 1023 << 52.
mov ax, 1023
shl rax, 52
movq xmm7, rax
; r15 = initial division result, read from state +96.
mov r15, QWORD PTR [r9+96]
; xmm3 = (state[48]^state[16], state[56]^state[24])
punpcklqdq xmm3, xmm0
movq xmm0, rcx
; xmm4 = (state[80]^state[64], state[88]^state[72])
punpcklqdq xmm4, xmm0
ALIGN 64
; --- main loop ------------------------------------------------------------
$main_loop_ultralitev2_ryzen:
; First half: one AES round on the 16 bytes at [rbx+r10] with key r8:r11.
movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11
movq xmm6, r8
punpcklqdq xmm6, xmm0
; rdx = address of the 16 bytes just loaded (written back further down).
lea rdx, QWORD PTR [r10+rbx]
; r9 = 2 * previous sqrt result (divisor seed); rdi = sqrt result << 32.
lea r9, QWORD PTR [rdi+rdi]
shl rdi, 32
; Offsets of the other three 16-byte chunks in the same 64-byte line.
mov ecx, r10d
mov eax, r10d
xor ecx, 16
xor eax, 32
xor r10d, 48
aesenc xmm5, xmm6
; Shuffle step: each chunk gets an addend and is stored to a different slot:
;   [^16] <- old[^48] + xmm4, [^32] <- old[^16] + xmm3, [^48] <- old[^32] + xmm6
movdqa xmm2, XMMWORD PTR [rcx+rbx]
movdqa xmm1, XMMWORD PTR [rax+rbx]
movdqa xmm0, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
paddq xmm0, xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm0
movdqa XMMWORD PTR [rax+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
movaps xmm1, xmm8
; rsi = previous division result XOR (previous sqrt result << 32).
mov rsi, r15
xor rsi, rdi
; r14 = low qword of the AES output.
movq r14, xmm5
; Write (AES output XOR xmm3) back to the first address.
movdqa xmm0, xmm5
pxor xmm0, xmm3
; r10 = second buffer offset, derived from the AES output.
mov r10, r14
and r10d, 131056
movdqa XMMWORD PTR [rdx], xmm0
; Fold the low qword at the second address into rsi; r12 keeps the address
; and r13 the high qword for use at the end of the iteration.
xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx]
mov r13, QWORD PTR [r10+rbx+8]
; Divisor = (r14d + 2*sqrt) | 0x80000001: 32-bit (the 32-bit ops zero the
; upper half of r9), odd, top bit set, hence never zero.
add r9d, r14d
or r9d, -2147483647
; Dividend = 0:high qword of the AES output.
xor edx, edx
movdqa xmm0, xmm5
psrldq xmm0, 8
movq rax, xmm0
div r9
; Pack low 32 bits of the quotient with low 32 bits of the remainder:
; r15 = quotient_lo32 | (remainder_lo32 << 32).
movq xmm0, rax
movq xmm1, rdx
punpckldq xmm0, xmm1
movq r15, xmm0
; Square-root input = r15 + r14; an unshifted copy stays in xmm2 for the
; fix-up path.
paddq xmm0, xmm5
movdqa xmm2, xmm0
; Build a double bit pattern: mantissa = input >> 12, exponent field = 1023.
psrlq xmm0, 12
paddq xmm0, xmm7
sqrtsd xmm1, xmm0
movq rdi, xmm1
; If the low 19 bits of the result are all zero, take the fix-up path;
; otherwise the sqrt result is simply the bit pattern shifted right by 19.
test rdi, 524287
je $sqrt_fixup_ultralitev2_ryzen
shr rdi, 19
$sqrt_fixup_ultralitev2_ryzen_ret:
; Second half: 64x64 -> 128-bit multiply, rdx:rax = rsi * r14.
mov rax, rsi
mul r14
; xmm0 = (high half, low half) of the product.
movq xmm1, rax
movq xmm0, rdx
punpcklqdq xmm0, xmm1
; Offsets of the three neighbour chunks of the second address.
mov r9d, r10d
mov ecx, r10d
xor r9d, 16
xor ecx, 32
xor r10d, 48
movdqa xmm1, XMMWORD PTR [rcx+rbx]
; The product halves are XOR-ed with the chunk at [^32] ...
xor rdx, [rcx+rbx]
xor rax, [rcx+rbx+8]
; ... and the chunk at [^16] is XOR-ed with the original product.
movdqa xmm2, XMMWORD PTR [r9+rbx]
pxor xmm2, xmm0
; Second shuffle step:
;   [^16] <- old[^48] + xmm4, [^32] <- (old[^16] ^ product) + xmm3,
;   [^48] <- old[^32] + xmm6
paddq xmm4, XMMWORD PTR [r10+rbx]
paddq xmm2, xmm3
paddq xmm1, xmm6
movdqa XMMWORD PTR [r9+rbx], xmm4
movdqa XMMWORD PTR [rcx+rbx], xmm2
movdqa XMMWORD PTR [r10+rbx], xmm1
; Age the history: xmm4 <- xmm3 (xmm3 <- xmm5 follows below).
movdqa xmm4, xmm3
; Accumulate the product into r8:r11 and store it at the second address.
add r8, rdx
add r11, rax
mov QWORD PTR [r12], r8
; Mix with the values read from the second address before the store.
xor r8, rsi
mov QWORD PTR [r12+8], r11
; Next iteration's first offset.
mov r10, r8
xor r11, r13
and r10d, 131056
movdqa xmm3, xmm5
dec ebp
jne $main_loop_ultralitev2_ryzen
; --- epilogue -------------------------------------------------------------
; Restore the caller's MXCSR and the saved registers.
ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48]
; r11 = rsp as it was before `sub rsp, 64`; with five pushes (40 bytes)
; below the entry rsp, the entry slots +16/+24/+32 are at r11+56/+64/+72.
lea r11, QWORD PTR [rsp+64]
mov rbx, QWORD PTR [r11+56]
mov rbp, QWORD PTR [r11+64]
mov rsi, QWORD PTR [r11+72]
movaps xmm8, XMMWORD PTR [r11-48]
movaps xmm7, XMMWORD PTR [rsp+32]
mov rsp, r11
pop r15
pop r14
pop r13
pop r12
pop rdi
; Skip over the out-of-line fix-up code.
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
; --- out-of-line square-root fix-up --------------------------------------
; Entered with rdi = raw sqrtsd bit pattern and xmm2 = unshifted sqrt input.
; Leaves the corrected result in rdi and rejoins the loop.
$sqrt_fixup_ultralitev2_ryzen:
movq r9, xmm2
dec rdi
; rdx = 0xFFFFFC02 << 32 (the 32-bit mov zero-extends before the shift).
mov edx, -1022
shl rdx, 32
mov rax, rdi
shr rdi, 19
shr rax, 20
mov rcx, rdi
sub rcx, rax
lea rcx, [rcx+rdx+1]
add rax, rdx
imul rcx, rax
; The borrow from (rcx - r9) is added to rdi: +1 when rcx < r9 (unsigned).
sub rcx, r9
adc rdi, 0
jmp $sqrt_fixup_ultralitev2_ryzen_ret
$cnv2_main_loop_ultralitev2_ryzen_endp:

View file

@ -1,267 +0,0 @@
; Main-loop fragment for the "cnv2 / soft_aes / ultralitev2 / sandybridge"
; variant (names taken from the labels below). Instead of the AESENC
; instruction it builds the round output from 32-bit table lookups.
; This is an include fragment: it contains no `ret`, and normal exit is a
; jump to the *_asm_endp label on the last line.
;
; Inputs (as read by the code):
;   rcx        -> state block; qwords at +0..+104 are read as seeds, +224 is
;                 the base address of the buffer the loop walks, and +272 is
;                 a pointer to the lookup tables (four consecutive tables of
;                 256 dwords; presumably the AES T-tables - confirm).
; Loop shape:
;   65536 iterations; every buffer offset is masked with 131056 (0x1FFF0),
;   i.e. a 16-byte-aligned offset below 128 KiB.
; Values kept across iterations:
;   r8 and [rsp+240] = the two halves of the 128-bit round key / accumulator
;   [rsp+248] = current masked offset       [rsp+224] = saved state pointer
;   xmm4 / xmm5 = results from the previous one / two iterations
;   xmm10 = division result   xmm13 = sqrt result   xmm11 = loop counter
;   xmm12 = buffer base       xmm8 = 1023 << 52     xmm9 = zero
; NOTE(review): comments use MASM ';' syntax - confirm this fragment is not
; also assembled by GAS, where ';' separates statements.

; --- prologue -------------------------------------------------------------
; Park the state pointer in the slot just above the return address
; (presumably the Win64 home space - confirm); it is reloaded every
; iteration from [rsp+224] once the frame below is set up.
mov QWORD PTR [rsp+8], rcx
push rbx
push rbp
push rsi
push rdi
push r12
push r13
push r14
push r15
sub rsp, 152
; Save the caller's MXCSR at [rsp+4] and install 24448 (0x5F80): the default
; 0x1F80 (all exceptions masked) plus bit 14 of the rounding-control field.
stmxcsr DWORD PTR [rsp+4]
mov DWORD PTR [rsp], 24448
ldmxcsr DWORD PTR [rsp]
; Seed the working registers by XOR-ing pairs of state qwords.
; r10 keeps the state pointer because rcx is reused below.
mov rax, QWORD PTR [rcx+48]
mov r10, rcx
xor rax, QWORD PTR [rcx+16]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
mov r9, QWORD PTR [rcx+40]
xor r9, QWORD PTR [rcx+8]
movq xmm4, rax
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
; r11 = base address of the buffer walked by the loop.
mov r11, QWORD PTR [rcx+224]
mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r10+72]
mov rax, QWORD PTR [r10+80]
movq xmm0, rdx
xor rax, QWORD PTR [r10+64]
; xmm6-xmm13 are saved in the local frame and restored in the epilogue.
movaps XMMWORD PTR [rsp+16], xmm6
movaps XMMWORD PTR [rsp+32], xmm7
movaps XMMWORD PTR [rsp+48], xmm8
movaps XMMWORD PTR [rsp+64], xmm9
movaps XMMWORD PTR [rsp+80], xmm10
movaps XMMWORD PTR [rsp+96], xmm11
movaps XMMWORD PTR [rsp+112], xmm12
movaps XMMWORD PTR [rsp+128], xmm13
movq xmm5, rax
; Only ax is written; the stale upper bits of rax do not matter because the
; shift by 52 discards everything above bit 11. Result: xmm8 = 1023 << 52.
mov ax, 1023
shl rax, 52
movq xmm8, rax
mov rax, r8
; xmm4 = (state[48]^state[16], state[56]^state[24])
punpcklqdq xmm4, xmm0
; rax = first buffer offset (low bits of r8, 16-byte aligned).
and eax, 131056
; xmm10 = initial division result (state +96); rcx = initial sqrt (+104).
movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx
mov rcx, QWORD PTR [r10+104]
xorps xmm9, xmm9
; [rsp+240] / [rsp+248] lie above the return address (8 pushes + 152 bytes
; = 216 below the entry rsp), i.e. in the caller-provided slots.
mov QWORD PTR [rsp+248], rax
movq xmm12, r11
mov QWORD PTR [rsp+240], r9
; xmm5 = (state[80]^state[64], state[88]^state[72])
punpcklqdq xmm5, xmm0
movq xmm13, rcx
; r12d = iteration counter (moved into xmm11 while r12 is used as a pointer).
mov r12d, 65536
ALIGN 64
; --- main loop ------------------------------------------------------------
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
movd xmm11, r12d
; r12 = lookup-table base; r13 = address of the current 16 bytes.
mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11]
; Load the four input dwords into esi, r10d, r14d, ebp and build the round
; key xmm7 = (r8, r9).
mov esi, DWORD PTR [r13]
movq xmm0, r9
mov r10d, DWORD PTR [r13+4]
movq xmm7, r8
mov ebp, DWORD PTR [r13+12]
mov r14d, DWORD PTR [r13+8]
mov rdx, QWORD PTR [rsp+248]
; Table 0 (offset +0): indexed by byte 0 of each dword.
; Results accumulate in r15d, edi, ebx, r9d (one per output dword).
movzx ecx, sil
shr esi, 8
punpcklqdq xmm7, xmm0
mov r15d, DWORD PTR [r12+rcx*4]
movzx ecx, r10b
shr r10d, 8
mov edi, DWORD PTR [r12+rcx*4]
movzx ecx, r14b
shr r14d, 8
mov ebx, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
shr ebp, 8
mov r9d, DWORD PTR [r12+rcx*4]
; Table 1 (offset +1024): indexed by byte 1, rotated by one dword.
movzx ecx, r10b
shr r10d, 8
xor r15d, DWORD PTR [r12+rcx*4+1024]
movzx ecx, r14b
shr r14d, 8
; eax = byte 3 of the third dword, biased by 256 so that it indexes table 3
; once r12 has been advanced to table 2.
mov eax, r14d
shr eax, 8
xor edi, DWORD PTR [r12+rcx*4+1024]
add eax, 256
movzx ecx, bpl
shr ebp, 8
xor ebx, DWORD PTR [r12+rcx*4+1024]
movzx ecx, sil
shr esi, 8
xor r9d, DWORD PTR [r12+rcx*4+1024]
; Advance r12 to table 2; indices biased by 256 reach table 3.
add r12, 2048
movzx ecx, r10b
shr r10d, 8
add r10d, 256
; Finish each output dword with its table 2 and table 3 contributions.
mov r11d, DWORD PTR [r12+rax*4]
xor r11d, DWORD PTR [r12+rcx*4]
xor r11d, r9d
movzx ecx, sil
mov r10d, DWORD PTR [r12+r10*4]
shr esi, 8
add esi, 256
xor r10d, DWORD PTR [r12+rcx*4]
movzx ecx, bpl
xor r10d, ebx
shr ebp, 8
movd xmm1, r11d
add ebp, 256
; r11 was used as a temporary above; restore the buffer base from xmm12.
movq r11, xmm12
mov r9d, DWORD PTR [r12+rcx*4]
xor r9d, DWORD PTR [r12+rsi*4]
mov eax, DWORD PTR [r12+rbp*4]
xor r9d, edi
movzx ecx, r14b
movd xmm0, r10d
movd xmm2, r9d
xor eax, DWORD PTR [r12+rcx*4]
; rcx/rax/rdx = offsets of the other three 16-byte chunks of the line.
mov rcx, rdx
xor eax, r15d
punpckldq xmm2, xmm1
xor rcx, 16
; Assemble the four output dwords into xmm6.
movd xmm6, eax
mov rax, rdx
punpckldq xmm6, xmm0
xor rax, 32
punpckldq xmm6, xmm2
xor rdx, 48
; Shuffle step: each chunk gets an addend and is stored to a different slot:
;   [^16] <- old[^48] + xmm5, [^32] <- old[^16] + xmm4, [^48] <- old[^32] + xmm7
movdqu xmm2, XMMWORD PTR [rcx+r11]
; XOR with the round key completes the round output in xmm6.
pxor xmm6, xmm7
paddq xmm2, xmm4
movdqu xmm1, XMMWORD PTR [rax+r11]
movdqu xmm0, XMMWORD PTR [rdx+r11]
paddq xmm0, xmm5
movdqu XMMWORD PTR [rcx+r11], xmm0
movdqu XMMWORD PTR [rax+r11], xmm2
; rcx = previous sqrt result.
movq rcx, xmm13
paddq xmm1, xmm7
movdqu XMMWORD PTR [rdx+r11], xmm1
; rdi = low qword of the round output; r10 = second buffer offset.
movq rdi, xmm6
mov r10, rdi
and r10d, 131056
xor edx, edx
; rbx = previous division result XOR (previous sqrt result << 32).
mov rax, rcx
shl rax, 32
movq rbx, xmm10
xor rbx, rax
; Divisor = (edi + 2*sqrt) | 0x80000001: 32-bit (the 32-bit add zeroes the
; upper half of r9), odd, top bit set, hence never zero.
lea r9, QWORD PTR [rcx+rcx]
add r9d, edi
; Write (round output XOR xmm4) back to the first address.
movdqa xmm0, xmm6
pxor xmm0, xmm4
mov ecx, -2147483647
movdqu XMMWORD PTR [r13], xmm0
or r9, rcx
; Dividend = 0:high qword of the round output.
movdqa xmm0, xmm6
movaps xmm1, xmm9
psrldq xmm0, 8
movq rax, xmm0
; Fold the low qword at the second address into rbx; r14 keeps the address
; and rbp the high qword for use at the end of the iteration.
xor rbx, QWORD PTR [r10+r11]
lea r14, QWORD PTR [r10+r11]
mov rbp, QWORD PTR [r14+8]
div r9
; rdx = (remainder << 32) + low 32 bits of the quotient.
shl rdx, 32
mov eax, eax
add rdx, rax
; r9 = square-root input (division result + rdi), kept for the fix-up path.
lea r9, QWORD PTR [rdx+rdi]
movq xmm10, rdx
; Build a double bit pattern: mantissa = input >> 12, exponent field = 1023.
mov rax, r9
shr rax, 12
movq xmm0, rax
paddq xmm0, xmm8
sqrtsd xmm1, xmm0
movq rdx, xmm1
; If the low 19 bits of the result are all zero, take the fix-up path;
; otherwise the sqrt result is simply the bit pattern shifted right by 19.
test rdx, 524287
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
psrlq xmm1, 19
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
; Second half. r9/rcx/r10 = offsets of the neighbour chunks of the second
; address; xmm13 = new sqrt result.
mov r9, r10
movdqa xmm13, xmm1
xor r9, 16
mov rcx, r10
xor rcx, 32
xor r10, 48
; 64x64 -> 128-bit multiply, rdx:rax = rbx * rdi.
mov rax, rbx
mul rdi
movdqu xmm2, XMMWORD PTR [r9+r11]
movdqu xmm1, XMMWORD PTR [rcx+r11]
paddq xmm1, xmm7
; xmm3 = (high half, low half) of the original product; the copies in
; rdx/rax are then XOR-ed with the chunk at [^32].
movq xmm0, rax
movq xmm3, rdx
xor rax, QWORD PTR [r11+rcx+8]
xor rdx, QWORD PTR [rcx+r11]
punpcklqdq xmm3, xmm0
add r8, rdx
; Second shuffle step:
;   [^16] <- old[^48] + xmm5, [^32] <- (old[^16] ^ product) + xmm4,
;   [^48] <- old[^32] + xmm7
movdqu xmm0, XMMWORD PTR [r10+r11]
pxor xmm2, xmm3
paddq xmm0, xmm5
paddq xmm2, xmm4
movdqu XMMWORD PTR [r9+r11], xmm0
; Age the history: xmm5 <- xmm4, xmm4 <- xmm6.
movdqa xmm5, xmm4
mov r9, QWORD PTR [rsp+240]
movdqa xmm4, xmm6
add r9, rax
movdqu XMMWORD PTR [rcx+r11], xmm2
movdqu XMMWORD PTR [r10+r11], xmm1
; Reload the state pointer saved in the prologue and the loop counter.
mov r10, QWORD PTR [rsp+224]
movd r12d, xmm11
; Store the accumulated pair at the second address, then mix it with the
; values read from that address before the store.
mov QWORD PTR [r14], r8
xor r8, rbx
mov rax, r8
mov QWORD PTR [r14+8], r9
; rax = next iteration's first offset.
and eax, 131056
xor r9, rbp
mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax
sub r12d, 1
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
; --- epilogue -------------------------------------------------------------
; Restore the caller's MXCSR and the saved registers.
ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16]
movaps xmm7, XMMWORD PTR [rsp+32]
movaps xmm8, XMMWORD PTR [rsp+48]
movaps xmm9, XMMWORD PTR [rsp+64]
movaps xmm10, XMMWORD PTR [rsp+80]
movaps xmm11, XMMWORD PTR [rsp+96]
movaps xmm12, XMMWORD PTR [rsp+112]
movaps xmm13, XMMWORD PTR [rsp+128]
add rsp, 152
pop r15
pop r14
pop r13
pop r12
pop rdi
pop rsi
pop rbp
pop rbx
; Skip over the out-of-line fix-up code.
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
; --- out-of-line square-root fix-up --------------------------------------
; Entered with rdx = raw sqrtsd bit pattern and r9 = unshifted sqrt input.
; Leaves the corrected result in xmm1 and rejoins the loop. r15 is free
; here: it is reloaded at the top of every iteration.
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
dec rdx
; r15 = 0xFFFFFC02 << 32 (the 32-bit mov zero-extends before the shift).
mov r15d, -1022
shl r15, 32
mov rax, rdx
shr rdx, 19
shr rax, 20
mov rcx, rdx
sub rcx, rax
lea rcx, [rcx+r15+1]
add rax, r15
imul rcx, rax
; The borrow from (rcx - r9) is added to rdx: +1 when rcx < r9 (unsigned).
sub rcx, r9
adc rdx, 0
movq xmm1, rdx
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:

View file

@ -5,7 +5,7 @@
push r14 push r14
push r15 push r15
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov ebp, 262144 mov ebp, ${ITERATIONS}
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56] mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24] xor rdx, QWORD PTR [rcx+24]
@ -18,7 +18,7 @@
xor rdi, QWORD PTR [rcx+8] xor rdi, QWORD PTR [rcx+8]
mov rdx, r8 mov rdx, r8
mov r15, QWORD PTR [rcx+264] mov r15, QWORD PTR [rcx+264]
and edx, 1048560 and edx, ${MASK}
mov r14, QWORD PTR [rax+35] mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192] xor r14, QWORD PTR [rcx+192]
mov rsi, QWORD PTR [rcx+224] mov rsi, QWORD PTR [rcx+224]
@ -26,14 +26,14 @@
movdqu xmm2, XMMWORD PTR [rdx+rsi] movdqu xmm2, XMMWORD PTR [rdx+rsi]
ALIGN 64 ALIGN 64
cn_litev1_mainloop_sandybridge: cnv1_main_loop_${ALGO}_sandybridge:
movq xmm0, rdi movq xmm0, rdi
movq xmm1, r8 movq xmm1, r8
punpcklqdq xmm1, xmm0 punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1 aesenc xmm2, xmm1
movq r10, xmm2 movq r10, xmm2
mov r9d, r10d mov r9d, r10d
and r9d, 1048560 and r9d, ${MASK}
add r9, rsi add r9, rsi
movdqa xmm0, xmm2 movdqa xmm0, xmm2
pxor xmm0, xmm3 pxor xmm0, xmm3
@ -56,11 +56,11 @@ cn_litev1_mainloop_sandybridge:
mov QWORD PTR [r9+8], rax mov QWORD PTR [r9+8], rax
xor r8, rbx xor r8, rbx
mov rdx, r8 mov rdx, r8
and edx, 1048560 and edx, ${MASK}
movdqu xmm2, XMMWORD PTR [rdx+rsi] movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11 xor rdi, r11
dec ebp dec ebp
jne cn_litev1_mainloop_sandybridge jne cnv1_main_loop_${ALGO}_sandybridge
mov rbx, QWORD PTR [rsp+24] mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32] mov rbp, QWORD PTR [rsp+32]

View file

@ -26,7 +26,7 @@
xor r13, QWORD PTR [rcx+8] xor r13, QWORD PTR [rcx+8]
mov rdx, r8 mov rdx, r8
mov rdi, QWORD PTR [rcx+224] mov rdi, QWORD PTR [rcx+224]
and edx, 2097136 and edx, ${MASK}
mov rax, QWORD PTR [rax+35] mov rax, QWORD PTR [rax+35]
xor rax, QWORD PTR [rcx+192] xor rax, QWORD PTR [rcx+192]
movq xmm5, rax movq xmm5, rax
@ -38,10 +38,10 @@
mov rax, QWORD PTR [rcx+264] mov rax, QWORD PTR [rcx+264]
movq xmm7, rax movq xmm7, rax
mov eax, 524288 mov eax, ${ITERATIONS}
ALIGN 64 ALIGN 64
cnv1_mainloop_soft_aes_sandybridge: cnv1_main_loop_${ALGO}_soft_aes_sandybridge:
movq xmm9, rax movq xmm9, rax
mov r12, QWORD PTR [rcx+272] mov r12, QWORD PTR [rcx+272]
mov esi, DWORD PTR [rdx+rdi] mov esi, DWORD PTR [rdx+rdi]
@ -114,7 +114,7 @@ cnv1_mainloop_soft_aes_sandybridge:
pxor xmm3, xmm1 pxor xmm3, xmm1
movq r9, xmm3 movq r9, xmm3
mov r10d, r9d mov r10d, r9d
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm0, xmm3 movdqa xmm0, xmm3
pxor xmm0, xmm4 pxor xmm0, xmm4
movdqu XMMWORD PTR [rdx+rdi], xmm0 movdqu XMMWORD PTR [rdx+rdi], xmm0
@ -141,10 +141,10 @@ cnv1_mainloop_soft_aes_sandybridge:
movq rax, xmm9 movq rax, xmm9
mov rdx, r8 mov rdx, r8
xor r13, r11 xor r13, r11
and edx, 2097136 and edx, ${MASK}
mov QWORD PTR [rsp+64], rdx mov QWORD PTR [rsp+64], rdx
sub eax, 1 sub eax, 1
jne cnv1_mainloop_soft_aes_sandybridge jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge
movaps xmm6, XMMWORD PTR [rsp] movaps xmm6, XMMWORD PTR [rsp]
movaps xmm7, XMMWORD PTR [rsp+16] movaps xmm7, XMMWORD PTR [rsp+16]

View file

@ -1,70 +0,0 @@
; Main-loop fragment for the "cnv1 / sandybridge" variant (name taken from
; the loop label). This is an include fragment: it ends after the register
; restores with no `ret`, so the including file must supply the surrounding
; procedure and the return.
;
; Inputs (as read by the code):
;   rcx        -> state block; qwords at +0..+56 are read as seeds, +192 is a
;                 qword mixed into the tweak, +224 is the base address of the
;                 buffer the loop walks, +256 is a pointer from which 8 bytes
;                 at offset 35 are read, and +264 is a pointer to a byte
;                 lookup table.
; Loop shape:
;   524288 iterations; every buffer offset is masked with 2097136
;   (0x1FFFF0), i.e. a 16-byte-aligned offset below 2 MiB.
; Register roles inside the loop:
;   rsi = buffer base     rdx = current masked offset
;   r8:rdi = 128-bit value used as the AES round key and updated each pass
;   xmm2 = 16 bytes loaded for the next AES round, xmm3 = previous AES output
;   r14 = tweak qword, r15 = byte table, ebp = counter
; NOTE(review): comments use MASM ';' syntax - confirm this fragment is not
; also assembled by GAS, where ';' separates statements.

; --- prologue -------------------------------------------------------------
; rbx/rbp/rsi/rdi are parked in the four stack slots above the return
; address (presumably the Win64 home space - confirm) instead of being pushed.
mov QWORD PTR [rsp+8], rbx
mov QWORD PTR [rsp+16], rbp
mov QWORD PTR [rsp+24], rsi
mov QWORD PTR [rsp+32], rdi
push r14
push r15
; Seed the working registers by XOR-ing pairs of state qwords.
mov rax, QWORD PTR [rcx+48]
; ebp = iteration counter.
mov ebp, 524288
xor rax, QWORD PTR [rcx+16]
mov rdx, QWORD PTR [rcx+56]
xor rdx, QWORD PTR [rcx+24]
mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx]
movq xmm3, rax
mov rax, QWORD PTR [rcx+256]
mov rdi, QWORD PTR [rcx+40]
movq xmm0, rdx
xor rdi, QWORD PTR [rcx+8]
mov rdx, r8
; r15 = byte lookup table used for the byte-11 substitution below.
mov r15, QWORD PTR [rcx+264]
; rdx = first buffer offset (the 32-bit `and` zeroes the upper half).
and edx, 2097136
; r14 = tweak: 8 bytes at offset 35 of the [rcx+256] buffer XOR state +192.
mov r14, QWORD PTR [rax+35]
xor r14, QWORD PTR [rcx+192]
; rsi = base address of the buffer walked by the loop.
mov rsi, QWORD PTR [rcx+224]
; xmm3 = (state[48]^state[16], state[56]^state[24])
punpcklqdq xmm3, xmm0
; Preload the first 16 bytes.
movdqu xmm2, XMMWORD PTR [rdx+rsi]
ALIGN 64
; --- main loop ------------------------------------------------------------
cnv1_mainloop_sandybridge:
; First half: one AES round on xmm2 with key r8:rdi.
movq xmm0, rdi
movq xmm1, r8
punpcklqdq xmm1, xmm0
aesenc xmm2, xmm1
; r10 = low qword of the AES output; r9 = address of the second location.
movq r10, xmm2
mov r9d, r10d
and r9d, 2097136
add r9, rsi
; Write (AES output XOR previous AES output) back to the first address and
; remember the current output in xmm3 for the next iteration.
movdqa xmm0, xmm2
pxor xmm0, xmm3
movdqa xmm3, xmm2
movdqu XMMWORD PTR [rdx+rsi], xmm0
; Replace byte 11 of the value just stored with table[r15][that byte].
psrldq xmm0, 11
movq rax, xmm0
movzx eax, al
movzx eax, BYTE PTR [rax+r15]
mov BYTE PTR [rsi+rdx+11], al
; Second half: load the 16 bytes at the second address (rbx low, r11 high).
mov rbx, QWORD PTR [r9]
mov r11, QWORD PTR [r9+8]
; 64x64 -> 128-bit multiply, rdx:rax = rbx * r10.
mov rax, rbx
mul r10
; Accumulate the product into r8:rdi; the high qword is stored XOR-ed with
; the tweak in r14, the low qword is stored as is.
add r8, rdx
mov QWORD PTR [r9], r8
add rdi, rax
mov rax, r14
xor rax, rdi
mov QWORD PTR [r9+8], rax
; Mix with the values read from the second address before the store.
xor r8, rbx
; rdx = next iteration's first offset; preload its 16 bytes.
mov rdx, r8
and edx, 2097136
movdqu xmm2, XMMWORD PTR [rdx+rsi]
xor rdi, r11
dec ebp
jne cnv1_mainloop_sandybridge
; --- epilogue -------------------------------------------------------------
; Two pushes (16 bytes) sit below the entry rsp, so the entry slots
; +8/+16/+24/+32 are now at +24/+32/+40/+48.
mov rbx, QWORD PTR [rsp+24]
mov rbp, QWORD PTR [rsp+32]
mov rsi, QWORD PTR [rsp+40]
mov rdi, QWORD PTR [rsp+48]
pop r15
pop r14

View file

@ -18,7 +18,7 @@
mov r10, QWORD PTR [rcx+32] mov r10, QWORD PTR [rcx+32]
mov r8, rcx mov r8, rcx
xor r10, QWORD PTR [rcx] xor r10, QWORD PTR [rcx]
mov r14d, 524288 mov r14d, ${ITERATIONS}
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
xor r11, QWORD PTR [rcx+8] xor r11, QWORD PTR [rcx+8]
mov rsi, QWORD PTR [rdx+224] mov rsi, QWORD PTR [rdx+224]
@ -41,7 +41,7 @@
movaps XMMWORD PTR [rsp+16], xmm15 movaps XMMWORD PTR [rsp+16], xmm15
mov rdx, r10 mov rdx, r10
movq xmm4, QWORD PTR [r8+96] movq xmm4, QWORD PTR [r8+96]
and edx, 2097136 and edx, ${MASK}
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
xorps xmm13, xmm13 xorps xmm13, xmm13
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
@ -83,7 +83,7 @@
mov rcx, rdi mov rcx, rdi
mov QWORD PTR [rsp+264], r11 mov QWORD PTR [rsp+264], r11
movq xmm8, rax movq xmm8, rax
and ecx, 2097136 and ecx, ${MASK}
punpcklqdq xmm8, xmm0 punpcklqdq xmm8, xmm0
movq xmm0, QWORD PTR [r9+96] movq xmm0, QWORD PTR [r9+96]
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
@ -95,7 +95,7 @@
movdqu xmm15, XMMWORD PTR [r9] movdqu xmm15, XMMWORD PTR [r9]
ALIGN 64 ALIGN 64
main_loop_double_sandybridge: cnv2_double_main_loop_${ALGO}_sandybridge:
movdqu xmm9, xmm15 movdqu xmm9, xmm15
mov eax, edx mov eax, edx
mov ebx, edx mov ebx, edx
@ -120,7 +120,7 @@ main_loop_double_sandybridge:
movq r11, xmm9 movq r11, xmm9
mov edx, r11d mov edx, r11d
and edx, 2097136 and edx, ${MASK}
movdqa xmm0, xmm9 movdqa xmm0, xmm9
pxor xmm0, xmm7 pxor xmm0, xmm7
movdqu XMMWORD PTR [r9], xmm0 movdqu XMMWORD PTR [r9], xmm0
@ -151,7 +151,7 @@ main_loop_double_sandybridge:
movdqu XMMWORD PTR [rax+rsi], xmm0 movdqu XMMWORD PTR [rax+rsi], xmm0
movq rcx, xmm10 movq rcx, xmm10
and ecx, 2097136 and ecx, ${MASK}
movdqa xmm0, xmm10 movdqa xmm0, xmm10
pxor xmm0, xmm6 pxor xmm0, xmm6
@ -199,7 +199,7 @@ main_loop_double_sandybridge:
mov QWORD PTR [rbx+8], rdx mov QWORD PTR [rbx+8], rdx
xor rdx, r9 xor rdx, r9
mov QWORD PTR [rsp+256], r11 mov QWORD PTR [rsp+256], r11
and r11d, 2097136 and r11d, ${MASK}
mov QWORD PTR [rsp+264], rdx mov QWORD PTR [rsp+264], rdx
mov QWORD PTR [rsp+8], r11 mov QWORD PTR [rsp+8], r11
lea r15, QWORD PTR [r11+r13] lea r15, QWORD PTR [r11+r13]
@ -249,8 +249,8 @@ main_loop_double_sandybridge:
mov rbx, rax mov rbx, rax
imul rax, rdx imul rax, rdx
sub r11, rax sub r11, rax
js div_fix_1_sandybridge js div_fix_1_${ALGO}_sandybridge
div_fix_1_ret_sandybridge: div_fix_1_ret_${ALGO}_sandybridge:
cvttsd2si rdx, xmm2 cvttsd2si rdx, xmm2
mov rax, rdx mov rax, rdx
@ -258,8 +258,8 @@ div_fix_1_ret_sandybridge:
movd xmm2, r11d movd xmm2, r11d
movd xmm4, ebx movd xmm4, ebx
sub r8, rax sub r8, rax
js div_fix_2_sandybridge js div_fix_2_${ALGO}_sandybridge
div_fix_2_ret_sandybridge: div_fix_2_ret_${ALGO}_sandybridge:
movd xmm1, r8d movd xmm1, r8d
movd xmm0, edx movd xmm0, edx
@ -275,15 +275,15 @@ div_fix_2_ret_sandybridge:
movdqa xmm5, xmm1 movdqa xmm5, xmm1
psrlq xmm5, 19 psrlq xmm5, 19
test r9, 524287 test r9, 524287
je sqrt_fix_1_sandybridge je sqrt_fix_1_${ALGO}_sandybridge
sqrt_fix_1_ret_sandybridge: sqrt_fix_1_ret_${ALGO}_sandybridge:
movq r9, xmm10 movq r9, xmm10
psrldq xmm1, 8 psrldq xmm1, 8
movq r8, xmm1 movq r8, xmm1
test r8, 524287 test r8, 524287
je sqrt_fix_2_sandybridge je sqrt_fix_2_${ALGO}_sandybridge
sqrt_fix_2_ret_sandybridge: sqrt_fix_2_ret_${ALGO}_sandybridge:
mov r12d, ecx mov r12d, ecx
mov r8d, ecx mov r8d, ecx
@ -313,7 +313,7 @@ sqrt_fix_2_ret_sandybridge:
mov QWORD PTR [r13], rdi mov QWORD PTR [r13], rdi
xor rdi, r10 xor rdi, r10
mov ecx, edi mov ecx, edi
and ecx, 2097136 and ecx, ${MASK}
lea r8, QWORD PTR [rcx+rsi] lea r8, QWORD PTR [rcx+rsi]
mov rdx, QWORD PTR [r13+8] mov rdx, QWORD PTR [r13+8]
@ -331,7 +331,7 @@ sqrt_fix_2_ret_sandybridge:
movdqa xmm6, xmm10 movdqa xmm6, xmm10
mov r9, r15 mov r9, r15
dec r14d dec r14d
jne main_loop_double_sandybridge jne cnv2_double_main_loop_${ALGO}_sandybridge
ldmxcsr DWORD PTR [rsp+272] ldmxcsr DWORD PTR [rsp+272]
movaps xmm13, XMMWORD PTR [rsp+48] movaps xmm13, XMMWORD PTR [rsp+48]
@ -354,19 +354,19 @@ sqrt_fix_2_ret_sandybridge:
pop rsi pop rsi
pop rbp pop rbp
pop rbx pop rbx
jmp cnv2_double_mainloop_asm_sandybridge_endp jmp cnv2_double_main_loop_${ALGO}_sandybridge_endp
div_fix_1_sandybridge: div_fix_1_${ALGO}_sandybridge:
dec rbx dec rbx
add r11, rdx add r11, rdx
jmp div_fix_1_ret_sandybridge jmp div_fix_1_ret_${ALGO}_sandybridge
div_fix_2_sandybridge: div_fix_2_${ALGO}_sandybridge:
dec rdx dec rdx
add r8, r9 add r8, r9
jmp div_fix_2_ret_sandybridge jmp div_fix_2_ret_${ALGO}_sandybridge
sqrt_fix_1_sandybridge: sqrt_fix_1_${ALGO}_sandybridge:
movq r8, xmm3 movq r8, xmm3
movdqa xmm0, xmm5 movdqa xmm0, xmm5
psrldq xmm0, 8 psrldq xmm0, 8
@ -385,9 +385,9 @@ sqrt_fix_1_sandybridge:
adc r9, 0 adc r9, 0
movq xmm5, r9 movq xmm5, r9
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
jmp sqrt_fix_1_ret_sandybridge jmp sqrt_fix_1_ret_${ALGO}_sandybridge
sqrt_fix_2_sandybridge: sqrt_fix_2_${ALGO}_sandybridge:
psrldq xmm3, 8 psrldq xmm3, 8
movq r11, xmm3 movq r11, xmm3
dec r8 dec r8
@ -405,6 +405,6 @@ sqrt_fix_2_sandybridge:
adc r8, 0 adc r8, 0
movq xmm0, r8 movq xmm0, r8
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
jmp sqrt_fix_2_ret_sandybridge jmp sqrt_fix_2_ret_${ALGO}_sandybridge
cnv2_double_mainloop_asm_sandybridge_endp: cnv2_double_main_loop_${ALGO}_sandybridge_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov ebp, 524288 mov ebp, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
@ -31,7 +31,7 @@
mov rcx, QWORD PTR [rcx+88] mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72] xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104] mov rdi, QWORD PTR [r9+104]
and r10d, 2097136 and r10d, ${MASK}
movaps XMMWORD PTR [rsp+48], xmm6 movaps XMMWORD PTR [rsp+48], xmm6
movd xmm4, rax movd xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7 movaps XMMWORD PTR [rsp+32], xmm7
@ -46,7 +46,7 @@
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
ALIGN 16 ALIGN 16
cnv2_main_loop_bulldozer: cnv2_main_loop_${ALGO}_bulldozer:
movdqa xmm5, XMMWORD PTR [r10+rbx] movdqa xmm5, XMMWORD PTR [r10+rbx]
movd xmm6, r8 movd xmm6, r8
pinsrq xmm6, r11, 1 pinsrq xmm6, r11, 1
@ -83,7 +83,7 @@ cnv2_main_loop_bulldozer:
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm3 pxor xmm0, xmm3
mov r10, r14 mov r10, r14
and r10d, 2097136 and r10d, ${MASK}
movdqa XMMWORD PTR [rdx], xmm0 movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx] xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx]
@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movd rdi, xmm1 movd rdi, xmm1
test rdi, 524287 test rdi, 524287
je sqrt_fixup_bulldozer je sqrt_fixup_${ALGO}_bulldozer
shr rdi, 19 shr rdi, 19
sqrt_fixup_bulldozer_ret: sqrt_fixup_${ALGO}_bulldozer_ret:
mov rax, rsi mov rax, rsi
mul r14 mul r14
movd xmm1, rax movd xmm1, rax
@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret:
mov QWORD PTR [r12+8], r11 mov QWORD PTR [r12+8], r11
mov r10, r8 mov r10, r8
xor r11, r13 xor r11, r13
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm3, xmm5 movdqa xmm3, xmm5
dec ebp dec ebp
jne cnv2_main_loop_bulldozer jne cnv2_main_loop_${ALGO}_bulldozer
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48] movaps xmm6, XMMWORD PTR [rsp+48]
@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret:
pop r13 pop r13
pop r12 pop r12
pop rdi pop rdi
jmp cnv2_main_loop_bulldozer_endp jmp cnv2_main_loop_${ALGO}_bulldozer_endp
sqrt_fixup_bulldozer: sqrt_fixup_${ALGO}_bulldozer:
movd r9, xmm5 movd r9, xmm5
add r9, r15 add r9, r15
dec rdi dec rdi
@ -175,6 +175,6 @@ sqrt_fixup_bulldozer:
imul rcx, rax imul rcx, rax
sub rcx, r9 sub rcx, r9
adc rdi, 0 adc rdi, 0
jmp sqrt_fixup_bulldozer_ret jmp sqrt_fixup_${ALGO}_bulldozer_ret
cnv2_main_loop_bulldozer_endp: cnv2_main_loop_${ALGO}_bulldozer_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov esi, 524288 mov esi, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
mov r13d, -2147483647 mov r13d, -2147483647
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
@ -35,7 +35,7 @@
movaps XMMWORD PTR [rsp+64], xmm6 movaps XMMWORD PTR [rsp+64], xmm6
movaps XMMWORD PTR [rsp+48], xmm7 movaps XMMWORD PTR [rsp+48], xmm7
movaps XMMWORD PTR [rsp+32], xmm8 movaps XMMWORD PTR [rsp+32], xmm8
and r10d, 2097136 and r10d, ${MASK}
movq xmm5, rax movq xmm5, rax
mov ax, 1023 mov ax, 1023
@ -48,7 +48,7 @@
movdqu xmm6, XMMWORD PTR [r10+rbx] movdqu xmm6, XMMWORD PTR [r10+rbx]
ALIGN 64 ALIGN 64
$main_loop_ivybridge: cnv2_main_loop_${ALGO}_ivybridge:
lea rdx, QWORD PTR [r10+rbx] lea rdx, QWORD PTR [r10+rbx]
mov ecx, r10d mov ecx, r10d
mov eax, r10d mov eax, r10d
@ -62,7 +62,7 @@ $main_loop_ivybridge:
aesenc xmm6, xmm7 aesenc xmm6, xmm7
movq rbp, xmm6 movq rbp, xmm6
mov r9, rbp mov r9, rbp
and r9d, 2097136 and r9d, ${MASK}
movdqu xmm2, XMMWORD PTR [rcx+rbx] movdqu xmm2, XMMWORD PTR [rcx+rbx]
movdqu xmm1, XMMWORD PTR [rax+rbx] movdqu xmm1, XMMWORD PTR [rax+rbx]
movdqu xmm0, XMMWORD PTR [r10+rbx] movdqu xmm0, XMMWORD PTR [r10+rbx]
@ -105,9 +105,9 @@ $main_loop_ivybridge:
sqrtsd xmm3, xmm0 sqrtsd xmm3, xmm0
movq rdx, xmm3 movq rdx, xmm3
test edx, 524287 test edx, 524287
je $sqrt_fixup_ivybridge je sqrt_fixup_${ALGO}_ivybridge
psrlq xmm3, 19 psrlq xmm3, 19
$sqrt_fixup_ivybridge_ret: sqrt_fixup_${ALGO}_ivybridge_ret:
mov ecx, r10d mov ecx, r10d
mov rax, rdi mov rax, rdi
@ -118,7 +118,7 @@ $sqrt_fixup_ivybridge_ret:
mov QWORD PTR [r14], r8 mov QWORD PTR [r14], r8
xor r8, rdi xor r8, rdi
mov edi, r8d mov edi, r8d
and edi, 2097136 and edi, ${MASK}
movq xmm0, rax movq xmm0, rax
xor rax, [rcx+rbx+8] xor rax, [rcx+rbx+8]
add r11, rax add r11, rax
@ -143,7 +143,7 @@ $sqrt_fixup_ivybridge_ret:
mov r10d, edi mov r10d, edi
xor r11, r12 xor r11, r12
dec rsi dec rsi
jne $main_loop_ivybridge jne cnv2_main_loop_${ALGO}_ivybridge
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
mov rbx, QWORD PTR [rsp+160] mov rbx, QWORD PTR [rsp+160]
@ -158,9 +158,9 @@ $sqrt_fixup_ivybridge_ret:
pop rdi pop rdi
pop rsi pop rsi
pop rbp pop rbp
jmp $cnv2_main_loop_ivybridge_endp jmp cnv2_main_loop_${ALGO}_ivybridge_endp
$sqrt_fixup_ivybridge: sqrt_fixup_${ALGO}_ivybridge:
dec rdx dec rdx
mov r13d, -1022 mov r13d, -1022
shl r13, 32 shl r13, 32
@ -177,6 +177,6 @@ $sqrt_fixup_ivybridge:
sub rcx, r9 sub rcx, r9
adc rdx, 0 adc rdx, 0
movq xmm3, rdx movq xmm3, rdx
jmp $sqrt_fixup_ivybridge_ret jmp sqrt_fixup_${ALGO}_ivybridge_ret
$cnv2_main_loop_ivybridge_endp: cnv2_main_loop_${ALGO}_ivybridge_endp:

View file

@ -15,7 +15,7 @@
mov rax, QWORD PTR [rcx+48] mov rax, QWORD PTR [rcx+48]
mov r9, rcx mov r9, rcx
xor rax, QWORD PTR [rcx+16] xor rax, QWORD PTR [rcx+16]
mov ebp, 524288 mov ebp, ${ITERATIONS}
mov r8, QWORD PTR [rcx+32] mov r8, QWORD PTR [rcx+32]
xor r8, QWORD PTR [rcx] xor r8, QWORD PTR [rcx]
mov r11, QWORD PTR [rcx+40] mov r11, QWORD PTR [rcx+40]
@ -31,7 +31,7 @@
mov rcx, QWORD PTR [rcx+88] mov rcx, QWORD PTR [rcx+88]
xor rcx, QWORD PTR [r9+72] xor rcx, QWORD PTR [r9+72]
mov rdi, QWORD PTR [r9+104] mov rdi, QWORD PTR [r9+104]
and r10d, 2097136 and r10d, ${MASK}
movaps XMMWORD PTR [rsp+48], xmm6 movaps XMMWORD PTR [rsp+48], xmm6
movq xmm4, rax movq xmm4, rax
movaps XMMWORD PTR [rsp+32], xmm7 movaps XMMWORD PTR [rsp+32], xmm7
@ -46,7 +46,7 @@
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
ALIGN 64 ALIGN 64
$main_loop_ryzen: cnv2_main_loop_${ALGO}_ryzen:
movdqa xmm5, XMMWORD PTR [r10+rbx] movdqa xmm5, XMMWORD PTR [r10+rbx]
movq xmm0, r11 movq xmm0, r11
movq xmm6, r8 movq xmm6, r8
@ -78,7 +78,7 @@ $main_loop_ryzen:
movdqa xmm0, xmm5 movdqa xmm0, xmm5
pxor xmm0, xmm3 pxor xmm0, xmm3
mov r10, r14 mov r10, r14
and r10d, 2097136 and r10d, ${MASK}
movdqa XMMWORD PTR [rdx], xmm0 movdqa XMMWORD PTR [rdx], xmm0
xor rsi, QWORD PTR [r10+rbx] xor rsi, QWORD PTR [r10+rbx]
lea r12, QWORD PTR [r10+rbx] lea r12, QWORD PTR [r10+rbx]
@ -103,10 +103,10 @@ $main_loop_ryzen:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movq rdi, xmm1 movq rdi, xmm1
test rdi, 524287 test rdi, 524287
je $sqrt_fixup_ryzen je sqrt_fixup_${ALGO}_ryzen
shr rdi, 19 shr rdi, 19
$sqrt_fixup_ryzen_ret: sqrt_fixup_${ALGO}_ryzen_ret:
mov rax, rsi mov rax, rsi
mul r14 mul r14
movq xmm1, rax movq xmm1, rax
@ -138,10 +138,10 @@ $sqrt_fixup_ryzen_ret:
mov QWORD PTR [r12+8], r11 mov QWORD PTR [r12+8], r11
mov r10, r8 mov r10, r8
xor r11, r13 xor r11, r13
and r10d, 2097136 and r10d, ${MASK}
movdqa xmm3, xmm5 movdqa xmm3, xmm5
dec ebp dec ebp
jne $main_loop_ryzen jne cnv2_main_loop_${ALGO}_ryzen
ldmxcsr DWORD PTR [rsp] ldmxcsr DWORD PTR [rsp]
movaps xmm6, XMMWORD PTR [rsp+48] movaps xmm6, XMMWORD PTR [rsp+48]
@ -157,9 +157,9 @@ $sqrt_fixup_ryzen_ret:
pop r13 pop r13
pop r12 pop r12
pop rdi pop rdi
jmp $cnv2_main_loop_ryzen_endp jmp cnv2_main_loop_${ALGO}_ryzen_endp
$sqrt_fixup_ryzen: sqrt_fixup_${ALGO}_ryzen:
movq r9, xmm2 movq r9, xmm2
dec rdi dec rdi
mov edx, -1022 mov edx, -1022
@ -174,6 +174,6 @@ $sqrt_fixup_ryzen:
imul rcx, rax imul rcx, rax
sub rcx, r9 sub rcx, r9
adc rdi, 0 adc rdi, 0
jmp $sqrt_fixup_ryzen_ret jmp sqrt_fixup_${ALGO}_ryzen_ret
$cnv2_main_loop_ryzen_endp: cnv2_main_loop_${ALGO}_ryzen_endp:

View file

@ -47,7 +47,7 @@
mov rax, r8 mov rax, r8
punpcklqdq xmm4, xmm0 punpcklqdq xmm4, xmm0
and eax, 2097136 and eax, ${MASK}
movq xmm10, QWORD PTR [r10+96] movq xmm10, QWORD PTR [r10+96]
movq xmm0, rcx movq xmm0, rcx
mov rcx, QWORD PTR [r10+104] mov rcx, QWORD PTR [r10+104]
@ -57,10 +57,10 @@
mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+240], r9
punpcklqdq xmm5, xmm0 punpcklqdq xmm5, xmm0
movq xmm13, rcx movq xmm13, rcx
mov r12d, 524288 mov r12d, ${ITERATIONS}
ALIGN 64 ALIGN 64
cnv2_mainloop_soft_aes_sandybridge: cnv2_main_loop_${ALGO}_soft_aes_sandybridge:
movd xmm11, r12d movd xmm11, r12d
mov r12, QWORD PTR [r10+272] mov r12, QWORD PTR [r10+272]
lea r13, QWORD PTR [rax+r11] lea r13, QWORD PTR [rax+r11]
@ -148,7 +148,7 @@ cnv2_mainloop_soft_aes_sandybridge:
movdqu XMMWORD PTR [rdx+r11], xmm1 movdqu XMMWORD PTR [rdx+r11], xmm1
movq rdi, xmm6 movq rdi, xmm6
mov r10, rdi mov r10, rdi
and r10d, 2097136 and r10d, ${MASK}
xor edx, edx xor edx, edx
mov rax, rcx mov rax, rcx
shl rax, 32 shl rax, 32
@ -181,9 +181,9 @@ cnv2_mainloop_soft_aes_sandybridge:
sqrtsd xmm1, xmm0 sqrtsd xmm1, xmm0
movq rdx, xmm1 movq rdx, xmm1
test rdx, 524287 test rdx, 524287
je sqrt_fixup_soft_aes_sandybridge je sqrt_fixup_${ALGO}_soft_aes_sandybridge
psrlq xmm1, 19 psrlq xmm1, 19
sqrt_fixup_soft_aes_sandybridge_ret: sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret:
mov r9, r10 mov r9, r10
movdqa xmm13, xmm1 movdqa xmm13, xmm1
@ -219,12 +219,12 @@ sqrt_fixup_soft_aes_sandybridge_ret:
xor r8, rbx xor r8, rbx
mov rax, r8 mov rax, r8
mov QWORD PTR [r14+8], r9 mov QWORD PTR [r14+8], r9
and eax, 2097136 and eax, ${MASK}
xor r9, rbp xor r9, rbp
mov QWORD PTR [rsp+240], r9 mov QWORD PTR [rsp+240], r9
mov QWORD PTR [rsp+248], rax mov QWORD PTR [rsp+248], rax
sub r12d, 1 sub r12d, 1
jne cnv2_mainloop_soft_aes_sandybridge jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge
ldmxcsr DWORD PTR [rsp+4] ldmxcsr DWORD PTR [rsp+4]
movaps xmm6, XMMWORD PTR [rsp+16] movaps xmm6, XMMWORD PTR [rsp+16]
@ -245,9 +245,9 @@ sqrt_fixup_soft_aes_sandybridge_ret:
pop rsi pop rsi
pop rbp pop rbp
pop rbx pop rbx
jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp
sqrt_fixup_soft_aes_sandybridge: sqrt_fixup_${ALGO}_soft_aes_sandybridge:
dec rdx dec rdx
mov r15d, -1022 mov r15d, -1022
shl r15, 32 shl r15, 32
@ -262,6 +262,6 @@ sqrt_fixup_soft_aes_sandybridge:
sub rcx, r9 sub rcx, r9
adc rdx, 0 adc rdx, 0
movq xmm1, rdx movq xmm1, rdx
jmp sqrt_fixup_soft_aes_sandybridge_ret jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret
cnv2_mainloop_soft_aes_sandybridge_asm_endp: cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp:

View file

@ -52,6 +52,7 @@ int64_t Client::m_sequence = 1;
Client::Client(int id, const char *agent, IClientListener *listener) : Client::Client(int id, const char *agent, IClientListener *listener) :
m_quiet(false), m_quiet(false),
m_nicehash(false), m_nicehash(false),
m_donate(false),
m_agent(agent), m_agent(agent),
m_listener(listener), m_listener(listener),
m_id(id), m_id(id),
@ -231,7 +232,7 @@ bool Client::parseJob(const rapidjson::Value &params, int *code)
PowVariant powVariant = Options::i()->powVariant(); PowVariant powVariant = Options::i()->powVariant();
if (!Options::i()->forcePowVariant()) { if (!Options::i()->forcePowVariant() || m_donate) {
if (params.HasMember("algo")) { if (params.HasMember("algo")) {
std::string algo = params["algo"].GetString(); std::string algo = params["algo"].GetString();

View file

@ -67,6 +67,7 @@ public:
inline int id() const { return m_id; } inline int id() const { return m_id; }
inline uint16_t port() const { return m_url.port(); } inline uint16_t port() const { return m_url.port(); }
inline void setQuiet(bool quiet) { m_quiet = quiet; } inline void setQuiet(bool quiet) { m_quiet = quiet; }
inline void setDonate(bool donate) { m_donate = donate; }
inline void setRetryPause(int ms) { m_retryPause = ms; } inline void setRetryPause(int ms) { m_retryPause = ms; }
static void onConnected(uv_async_t *handle); static void onConnected(uv_async_t *handle);
@ -99,6 +100,7 @@ private:
bool m_quiet; bool m_quiet;
bool m_nicehash; bool m_nicehash;
bool m_donate;
char m_buf[2048]; char m_buf[2048];
char m_rpcId[64]; char m_rpcId[64];
char m_sendBuf[768]; char m_sendBuf[768];

View file

@ -150,9 +150,9 @@ PowVariant Job::powVariant() const
} else { } else {
return PowVariant::POW_V0; return PowVariant::POW_V0;
} }
} else if (m_powVariant == PowVariant::POW_XTL) { } else if (m_powVariant == PowVariant::POW_MSR) {
if (m_blob[0] > 5) { if (m_blob[0] > 8) {
return PowVariant::POW_XTL_V9; return PowVariant::POW_FAST_2;
} }
} }

View file

@ -67,7 +67,7 @@ public:
bool operator!=(const Job &other) const; bool operator!=(const Job &other) const;
private: private:
uint8_t m_blob[96]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk. uint8_t m_blob[MAX_BLOB_SIZE]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk.
bool m_nicehash; bool m_nicehash;
int m_poolId; int m_poolId;

View file

@ -81,6 +81,7 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
m_client->setUrl(url); m_client->setUrl(url);
m_client->setRetryPause(Options::i()->retryPause() * 1000); m_client->setRetryPause(Options::i()->retryPause() * 1000);
m_client->setQuiet(true); m_client->setQuiet(true);
m_client->setDonate(true);
delete url; delete url;

View file

@ -63,7 +63,7 @@ public:
State(size_t hashMultiplier) State(size_t hashMultiplier)
{ {
nonces = new uint32_t[hashMultiplier]; nonces = new uint32_t[hashMultiplier];
blob = new uint8_t[84 * hashMultiplier]; blob = new uint8_t[MAX_BLOB_SIZE * hashMultiplier];
for(size_t i=0; i<hashMultiplier; ++i) { for(size_t i=0; i<hashMultiplier; ++i) {
nonces[i] = 0; nonces[i] = 0;