Refactored ASM code
- Removed duplicate code - Autogenerated ASM files based on templates - Cleanup in naming
This commit is contained in:
parent
6574a8e844
commit
2ec65c7a20
63 changed files with 712 additions and 6907 deletions
|
@ -219,24 +219,7 @@ else()
|
|||
endif(WITH_CC_SERVER OR WITH_CC_CLIENT)
|
||||
|
||||
if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
|
||||
enable_language(ASM_MASM)
|
||||
set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm")
|
||||
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
|
||||
else()
|
||||
enable_language(ASM)
|
||||
|
||||
if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
|
||||
set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S")
|
||||
else()
|
||||
set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
|
||||
endif()
|
||||
|
||||
set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
|
||||
endif()
|
||||
|
||||
add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE})
|
||||
set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)
|
||||
include(cmake/asm.cmake)
|
||||
else()
|
||||
add_definitions(/DXMRIG_NO_ASM)
|
||||
endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
|
||||
|
|
115
cmake/asm.cmake
Normal file
115
cmake/asm.cmake
Normal file
|
@ -0,0 +1,115 @@
|
|||
# Generates the per-algorithm CryptoNight main-loop include files from their
# ".inc.in" templates and defines the static `xmrig_asm` library.
#
# Every algorithm variant sets ALGO, ITERATIONS and MASK and then runs
# configure_file() over each template of its family (cnv1 or cnv2), once for
# the generic sources in src/crypto/asm and once for the Windows sources in
# src/crypto/asm/win. Relative configure_file() inputs resolve against
# CMAKE_CURRENT_SOURCE_DIR and relative outputs against
# CMAKE_CURRENT_BINARY_DIR.

# Template base names (without the ".inc.in" suffix) of the CN v1 family.
set(XMRIG_ASM_CNV1_TEMPLATES
    cnv1_main_loop_sandybridge
    cnv1_main_loop_soft_aes_sandybridge
)

# Template base names (without the ".inc.in" suffix) of the CN v2 family.
set(XMRIG_ASM_CNV2_TEMPLATES
    cnv2_main_loop_ivybridge
    cnv2_main_loop_bulldozer
    cnv2_main_loop_ryzen
    cnv2_double_main_loop_sandybridge
    cnv2_main_loop_soft_aes_sandybridge
)

# xmrig_configure_asm_variant(<algo> <iterations> <mask> <loop_name> <template>...)
#
#   algo_name       value assigned to ALGO
#   iteration_count value assigned to ITERATIONS (decimal string)
#   index_mask      value assigned to MASK (decimal string)
#   loop_name       text that replaces "main_loop" in each template name to
#                   form the output name; pass "main_loop" to keep the
#                   template name unchanged
#   ARGN            template base names to configure
#
# This is a macro rather than a function on purpose: ALGO, ITERATIONS and MASK
# are set in the including scope, exactly as the plain set() calls it replaces
# did, so they stay visible to configure_file() and to whatever runs after
# this file.
# NOTE(review): presumably the templates reference ALGO/ITERATIONS/MASK;
# the template contents are not visible here - confirm.
macro(xmrig_configure_asm_variant algo_name iteration_count index_mask loop_name)
    set(ALGO "${algo_name}")
    set(ITERATIONS "${iteration_count}")
    set(MASK "${index_mask}")

    foreach(xmrig_asm_dir "src/crypto/asm" "src/crypto/asm/win")
        foreach(xmrig_asm_template ${ARGN})
            string(REPLACE "main_loop" "${loop_name}" xmrig_asm_output "${xmrig_asm_template}")
            configure_file("${xmrig_asm_dir}/${xmrig_asm_template}.inc.in" "${xmrig_asm_dir}/${xmrig_asm_output}.inc")
        endforeach()
    endforeach()
endmacro()

# CN v1 original: 0x80000 iterations, mask 0x1FFFF0
xmrig_configure_asm_variant("original" "524288" "2097136" "main_loop" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 original: 0x80000 iterations, mask 0x1FFFF0
xmrig_configure_asm_variant("originalv2" "524288" "2097136" "main_loop" ${XMRIG_ASM_CNV2_TEMPLATES})

# CN v1 fast: 0x40000 iterations, mask 0x1FFFF0
xmrig_configure_asm_variant("fast" "262144" "2097136" "main_loop_fast" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 fast: 0x40000 iterations, mask 0x1FFFF0
xmrig_configure_asm_variant("fastv2" "262144" "2097136" "main_loop_fastv2" ${XMRIG_ASM_CNV2_TEMPLATES})

# CN lite: 0x40000 iterations, mask 0xFFFF0
xmrig_configure_asm_variant("lite" "262144" "1048560" "main_loop_lite" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN upx: 0x20000 iterations, mask 0xFFFF0
xmrig_configure_asm_variant("upx" "131072" "1048560" "main_loop_upx" ${XMRIG_ASM_CNV1_TEMPLATES})

# CN v2 ultralite: 0x10000 iterations, mask 0x1FFF0
xmrig_configure_asm_variant("ultralite" "65536" "131056" "main_loop_ultralite" ${XMRIG_ASM_CNV2_TEMPLATES})

# Pick the assembler language and the top-level assembly source for the
# active C compiler.
if (CMAKE_C_COMPILER_ID MATCHES MSVC)
    enable_language(ASM_MASM)
    set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm")
    # NOTE(review): this sets a source property literally named "ASM_MASM"
    # with no value. If the intent was to select the source language, the
    # property name would be LANGUAGE - confirm before changing.
    set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
else()
    enable_language(ASM)

    # GCC on Windows uses its own variant of the main-loop source.
    if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
        set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop_win_gcc.S")
    else()
        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
    endif()

    # NOTE(review): same pattern as above - sets a property named "C" with no
    # value rather than LANGUAGE C - confirm the intent.
    set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
endif()

# The library holds only the assembly source, so the linker language is set
# explicitly.
add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE})
set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)
|
|
@ -328,7 +328,7 @@ constexpr static const char *pow_variant_names[] = {
|
|||
"xhv",
|
||||
"rto",
|
||||
"xfh",
|
||||
"xtlv9",
|
||||
"fast2",
|
||||
"upx",
|
||||
"turtle"
|
||||
};
|
||||
|
@ -420,10 +420,7 @@ Options::Options(int argc, char **argv) :
|
|||
{
|
||||
m_pools.push_back(new Url());
|
||||
|
||||
parseConfig(Platform::defaultConfigName());
|
||||
|
||||
int key;
|
||||
|
||||
while (true) {
|
||||
key = getopt_long(argc, argv, short_options, options, nullptr);
|
||||
if (key < 0) {
|
||||
|
@ -440,6 +437,10 @@ Options::Options(int argc, char **argv) :
|
|||
return;
|
||||
}
|
||||
|
||||
if (!m_pools[0]->isValid() && (!m_ccHost || m_ccPort == 0)) {
|
||||
parseConfig(Platform::defaultConfigName());
|
||||
}
|
||||
|
||||
#ifdef XMRIG_CC_SERVER
|
||||
if (m_ccPort == 0) {
|
||||
fprintf(stderr, "No CC Server Port supplied. Exiting.\n");
|
||||
|
@ -1176,8 +1177,10 @@ bool Options::parsePowVariant(const char *powVariant)
|
|||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") || !strcmp(powVariant, "half"))) {
|
||||
m_powVariant = POW_XTL_V9;
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "stellitev9") || !strcmp(powVariant, "xtlv2") ||
|
||||
!strcmp(powVariant, "half") || !strcmp(powVariant, "msr2") ||
|
||||
!strcmp(powVariant, "xtlv9"))) {
|
||||
m_powVariant = POW_FAST_2;
|
||||
break;
|
||||
}
|
||||
|
||||
|
@ -1186,7 +1189,7 @@ bool Options::parsePowVariant(const char *powVariant)
|
|||
break;
|
||||
}
|
||||
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "trtl")) {
|
||||
if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "trtl") || !strcmp(powVariant, "turtlev2") || !strcmp(powVariant, "pico"))) {
|
||||
m_powVariant = POW_TURTLE;
|
||||
break;
|
||||
}
|
||||
|
|
|
@ -29,6 +29,8 @@
|
|||
#define MAX_NUM_HASH_BLOCKS 5
|
||||
#endif
|
||||
|
||||
#define MAX_BLOB_SIZE 128
|
||||
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
|
|
|
@ -35,7 +35,7 @@ enum PowVariant
|
|||
POW_XHV,
|
||||
POW_RTO,
|
||||
POW_XFH,
|
||||
POW_XTL_V9,
|
||||
POW_FAST_2,
|
||||
POW_UPX,
|
||||
POW_TURTLE,
|
||||
LAST_ITEM
|
||||
|
@ -65,8 +65,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
|
|||
return "rto";
|
||||
case POW_XFH:
|
||||
return "xfh";
|
||||
case POW_XTL_V9:
|
||||
return "xtlv9";
|
||||
case POW_FAST_2:
|
||||
return "fast2";
|
||||
case POW_UPX:
|
||||
return "upx";
|
||||
case POW_TURTLE:
|
||||
|
@ -138,11 +138,11 @@ inline PowVariant parseVariant(const std::string variant)
|
|||
powVariant = PowVariant::POW_RTO;
|
||||
} else if (variant == "xfh" || variant == "freehaven" || variant == "faven") {
|
||||
powVariant = PowVariant::POW_XFH;
|
||||
} else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2" || variant == "half") {
|
||||
powVariant = PowVariant::POW_XTL_V9;
|
||||
} else if (variant == "xtlv9" || variant == "stellite_v9" || variant == "xtlv2" || variant == "half" || variant == "msr2" || variant == "fast2") {
|
||||
powVariant = PowVariant::POW_FAST_2;
|
||||
} else if (variant == "upx" || variant == "uplexa" || variant == "cn-upx") {
|
||||
powVariant = PowVariant::POW_UPX;
|
||||
} else if (variant == "turtle" || variant == "trtl") {
|
||||
} else if (variant == "turtle" || variant == "trtl" || variant == "pico" || variant == "turtlev2") {
|
||||
powVariant = PowVariant::POW_TURTLE;
|
||||
}
|
||||
|
||||
|
|
|
@ -70,7 +70,7 @@ static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVer
|
|||
CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_XTL_V9) {
|
||||
} else if (powVersion == PowVariant::POW_FAST_2) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
|
@ -124,7 +124,7 @@ static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powV
|
|||
CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
} else if (powVersion == PowVariant::POW_XTL_V9) {
|
||||
} else if (powVersion == PowVariant::POW_FAST_2) {
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
|
@ -237,6 +237,7 @@ static void cryptonight_super_lite_softaes(AsmOptimization asmOptimization, PowV
|
|||
|
||||
template <size_t NUM_HASH_BLOCKS>
|
||||
static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
|
||||
# if !defined(XMRIG_ARMv7)
|
||||
#if defined(XMRIG_ARM)
|
||||
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
#else
|
||||
|
@ -248,6 +249,7 @@ static void cryptonight_ultra_lite_aesni(AsmOptimization asmOptimization, PowVar
|
|||
CryptoNightMultiHash<0x10000, POW_DEFAULT_INDEX_SHIFT, MEMORY_ULTRA_LITE, 0x1FFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
|
||||
}
|
||||
#endif
|
||||
# endif
|
||||
}
|
||||
|
||||
template <size_t NUM_HASH_BLOCKS>
|
||||
|
@ -642,7 +644,7 @@ bool CryptoNight::selfTest(int algo)
|
|||
|
||||
// cnv8 + xtl aka cn-fast2
|
||||
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL_V9, test_input, 76, output, scratchPads);
|
||||
cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_FAST_2, test_input, 76, output, scratchPads);
|
||||
result = result && memcmp(output, test_output_xtl_v9, 32) == 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -1430,74 +1430,72 @@ public:
|
|||
|
||||
// single
|
||||
inline static void hashPowV3(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad)
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad)
|
||||
{
|
||||
const uint8_t* l;
|
||||
uint64_t* h;
|
||||
uint64_t al;
|
||||
uint64_t ah;
|
||||
uint64_t idx;
|
||||
__m128i bx0;
|
||||
__m128i bx1;
|
||||
keccak(input, (int) size, scratchPad[0]->state, 200);
|
||||
|
||||
keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
|
||||
const uint8_t* l0 = scratchPad[0]->memory;
|
||||
uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
||||
|
||||
l = scratchPad[0]->memory;
|
||||
h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
|
||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
|
||||
|
||||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
|
||||
uint64_t al0 = h0[0] ^h0[4];
|
||||
uint64_t ah0 = h0[1] ^h0[5];
|
||||
|
||||
al = h[0] ^ h[4];
|
||||
ah = h[1] ^ h[5];
|
||||
bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
|
||||
bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]);
|
||||
idx = h[0] ^ h[4];
|
||||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
|
||||
|
||||
uint64_t division_result_xmm0 = h[12];
|
||||
uint64_t sqrt_result0 = h[13];
|
||||
uint64_t idx0 = h0[0] ^h0[4];
|
||||
uint64_t division_result_xmm0 = h0[12];
|
||||
uint64_t sqrt_result0 = h0[13];
|
||||
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
const __m128i ax = _mm_set_epi64x(ah, al);
|
||||
__m128i cx0;
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
|
||||
__m128i cx;
|
||||
if (SOFT_AES) {
|
||||
cx = soft_aesenc((uint32_t*) &l[idx & MASK], ax);
|
||||
cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0);
|
||||
} else {
|
||||
cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
|
||||
cx = _mm_aesenc_si128(cx, ax);
|
||||
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
|
||||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
|
||||
}
|
||||
|
||||
SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax)
|
||||
SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0)
|
||||
|
||||
_mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx));
|
||||
_mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
|
||||
|
||||
idx0 = EXTRACT64(cx0);
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
cl = ((uint64_t*) &l[idx & MASK])[0];
|
||||
ch = ((uint64_t*) &l[idx & MASK])[1];
|
||||
cl = ((uint64_t*) &l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
|
||||
|
||||
INTEGER_MATH_V2(0, cl, cx)
|
||||
INTEGER_MATH_V2(0, cl, cx0);
|
||||
|
||||
lo = __umul128(idx, cl, &hi);
|
||||
lo = __umul128(idx0, cl, &hi);
|
||||
|
||||
SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi)
|
||||
SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi);
|
||||
|
||||
al += hi;
|
||||
ah += lo;
|
||||
al0 += hi;
|
||||
ah0 += lo;
|
||||
|
||||
((uint64_t*) &l[idx & MASK])[0] = al;
|
||||
((uint64_t*) &l[idx & MASK])[1] = ah;
|
||||
((uint64_t*) &l0[idx0 & MASK])[0] = al0;
|
||||
((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
|
||||
|
||||
ah ^= ch;
|
||||
al ^= cl;
|
||||
idx = al;
|
||||
ah0 ^= ch;
|
||||
al0 ^= cl;
|
||||
idx0 = al0;
|
||||
|
||||
bx0 = cx;
|
||||
bx10 = bx00;
|
||||
bx00 = cx0;
|
||||
}
|
||||
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
|
||||
keccakf(h, 24);
|
||||
cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
|
||||
|
||||
keccakf(h0, 24);
|
||||
|
||||
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
|
||||
}
|
||||
|
||||
|
@ -2016,9 +2014,9 @@ public:
|
|||
|
||||
// double
|
||||
inline static void hashPowV3(const uint8_t* __restrict__ input,
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad)
|
||||
size_t size,
|
||||
uint8_t* __restrict__ output,
|
||||
ScratchPad** __restrict__ scratchPad)
|
||||
{
|
||||
keccak(input, (int) size, scratchPad[0]->state, 200);
|
||||
keccak(input + size, (int) size, scratchPad[1]->state, 200);
|
||||
|
@ -2037,9 +2035,9 @@ public:
|
|||
uint64_t ah1 = h1[1] ^h1[5];
|
||||
|
||||
__m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
|
||||
__m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
|
||||
|
||||
__m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
|
||||
|
||||
__m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]);
|
||||
__m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]);
|
||||
|
||||
uint64_t idx0 = h0[0] ^h0[4];
|
||||
|
@ -2048,15 +2046,16 @@ public:
|
|||
uint64_t division_result_xmm0 = h0[12];
|
||||
uint64_t division_result_xmm1 = h1[12];
|
||||
|
||||
uint64_t sqrt_result0 = h0[13];
|
||||
uint64_t sqrt_result1 = h1[13];
|
||||
uint64_t sqrt_result0 = h0[13];
|
||||
uint64_t sqrt_result1 = h1[13];
|
||||
|
||||
for (size_t i = 0; i < ITERATIONS; i++) {
|
||||
__m128i cx0;
|
||||
__m128i cx1;
|
||||
|
||||
const __m128i ax0 = _mm_set_epi64x(ah0, al0);
|
||||
const __m128i ax1 = _mm_set_epi64x(ah1, al1);
|
||||
|
||||
__m128i cx0;
|
||||
__m128i cx1;
|
||||
if (SOFT_AES) {
|
||||
cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0);
|
||||
cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1);
|
||||
|
@ -2064,8 +2063,8 @@ public:
|
|||
cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
|
||||
cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
|
||||
|
||||
cx0 = _mm_aesenc_si128(cx0, ax0);
|
||||
cx1 = _mm_aesenc_si128(cx1, ax1);
|
||||
cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
|
||||
cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
|
||||
}
|
||||
|
||||
SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0)
|
||||
|
@ -2077,9 +2076,6 @@ public:
|
|||
idx0 = EXTRACT64(cx0);
|
||||
idx1 = EXTRACT64(cx1);
|
||||
|
||||
_mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
|
||||
_mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));
|
||||
|
||||
uint64_t hi, lo, cl, ch;
|
||||
cl = ((uint64_t*) &l0[idx0 & MASK])[0];
|
||||
ch = ((uint64_t*) &l0[idx0 & MASK])[1];
|
||||
|
@ -2103,6 +2099,7 @@ public:
|
|||
bx10 = bx00;
|
||||
bx00 = cx0;
|
||||
|
||||
|
||||
cl = ((uint64_t*) &l1[idx1 & MASK])[0];
|
||||
ch = ((uint64_t*) &l1[idx1 & MASK])[1];
|
||||
|
||||
|
|
|
@ -50,29 +50,34 @@ extern "C"
|
|||
#include "crypto/c_skein.h"
|
||||
|
||||
#ifndef XMRIG_NO_ASM
|
||||
void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_litev1_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fast_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cn_fastv2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cn_liteupx_mainloop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_ultralitev2_mainloop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cn_ultralitev2_mainloop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cn_ultralitev2_mainloop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cn_ultralitev2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fast_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_litev1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_fastv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_liteupx_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx);
|
||||
void cnv1_main_loop_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_lite_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_fast_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_upx_sandybridge_asm(ScratchPad* ctx0);
|
||||
|
||||
void cnv2_main_loop_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_main_loop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
|
||||
void cnv2_main_loop_fastv2_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_fastv2_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_fastv2_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_main_loop_fastv2_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
|
||||
void cnv2_main_loop_ultralite_ivybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_ultralite_ryzen_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_ultralite_bulldozer_asm(ScratchPad* ctx0);
|
||||
void cnv2_double_main_loop_ultralite_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
|
||||
|
||||
void cnv1_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_lite_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_fast_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv1_main_loop_upx_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
|
||||
void cnv2_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(ScratchPad* ctx0);
|
||||
void cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(ScratchPad* ctx);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -1437,28 +1442,28 @@ public:
|
|||
if (SOFT_AES) {
|
||||
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
||||
|
||||
if (ITERATIONS == 0x80000) {
|
||||
cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
if (ITERATIONS == 0x40000) {
|
||||
if (MASK == 0x1FFFF0) {
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_fast_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_lite_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else if (ITERATIONS == 0x20000) {
|
||||
cnv1_main_loop_upx_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else {
|
||||
if (ITERATIONS == 0x80000) {
|
||||
cnv1_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
if (ITERATIONS == 0x40000) {
|
||||
if (MASK == 0x1FFFF0) {
|
||||
cn_fast_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_fast_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_litev1_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_lite_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else if (ITERATIONS == 0x20000) {
|
||||
cnv1_main_loop_upx_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cn_liteupx_mainloop_sandybridge_asm(scratchPad[0]);
|
||||
cnv1_main_loop_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -1560,36 +1565,36 @@ public:
|
|||
scratchPad[0]->input = input;
|
||||
scratchPad[0]->t_fn = (const uint32_t*)saes_table;
|
||||
if (ITERATIONS == 0x40000) {
|
||||
cn_fastv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv2_main_loop_fastv2_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x10000) {
|
||||
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv2_main_loop_ultralite_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
cnv2_main_loop_soft_aes_sandybridge_asm(scratchPad[0]);
|
||||
}
|
||||
} else {
|
||||
if (ITERATIONS == 0x10000) {
|
||||
cn_ultralitev2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
cn_fastv2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||
if (ITERATIONS == 0x40000) {
|
||||
cnv2_main_loop_fastv2_ivybridge_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x10000) {
|
||||
cnv2_main_loop_ultralite_ivybridge_asm(scratchPad[0]);
|
||||
} else {
|
||||
cnv2_mainloop_ivybridge_asm(scratchPad[0]);
|
||||
cnv2_main_loop_ivybridge_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
} else if (asmOptimization == AsmOptimization::ASM_RYZEN) {
|
||||
if (ITERATIONS == 0x10000) {
|
||||
cn_ultralitev2_mainloop_ryzen_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
cn_fastv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||
if (ITERATIONS == 0x40000) {
|
||||
cnv2_main_loop_fastv2_ryzen_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x10000) {
|
||||
cnv2_main_loop_ultralite_ryzen_asm(scratchPad[0]);
|
||||
} else {
|
||||
cnv2_mainloop_ryzen_asm(scratchPad[0]);
|
||||
cnv2_main_loop_ryzen_asm(scratchPad[0]);
|
||||
}
|
||||
} else if (asmOptimization == AsmOptimization::ASM_BULLDOZER) {
|
||||
if (ITERATIONS == 0x10000) {
|
||||
cn_ultralitev2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
cn_fastv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||
if (ITERATIONS == 0x40000) {
|
||||
cnv2_main_loop_fastv2_bulldozer_asm(scratchPad[0]);
|
||||
} else if (ITERATIONS == 0x10000) {
|
||||
cnv2_main_loop_ultralite_bulldozer_asm(scratchPad[0]);
|
||||
} else {
|
||||
cnv2_mainloop_bulldozer_asm(scratchPad[0]);
|
||||
cnv2_main_loop_bulldozer_asm(scratchPad[0]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
@ -2306,12 +2311,12 @@ public:
|
|||
cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
|
||||
|
||||
#ifndef XMRIG_NO_ASM
|
||||
if (ITERATIONS == 0x10000) {
|
||||
cn_ultralitev2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
} else if (ITERATIONS == 0x40000) {
|
||||
cn_fastv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
if (ITERATIONS == 0x40000) {
|
||||
cnv2_double_main_loop_fastv2_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
} else if (ITERATIONS == 0x10000) {
|
||||
cnv2_double_main_loop_ultralite_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
} else {
|
||||
cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
cnv2_double_main_loop_sandybridge_asm(scratchPad[0], scratchPad[1]);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -1,166 +0,0 @@
|
|||
; Loop body labelled cn_fast_mainloop_soft_aes_sandybridge: one 16-byte block
; per iteration is transformed with dword table lookups instead of an AES
; instruction, followed by a 64x64-bit multiply and a write-back.
; rcx holds a context pointer on entry. Offsets read from it in this fragment:
;   0..56 and 192  qword values XORed together to seed the loop state
;   224            base pointer for every masked memory access
;   256            pointer that is dereferenced at +35
;   264            byte lookup table
;   272            dword lookup tables
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 2097136 (0x1FFFF0).
; The fragment contains no ret; presumably the including file supplies it.
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
; 72 bytes of locals: 4 x 16 bytes for xmm6-xmm9 plus one qword at [rsp+64]
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
; rdi = base pointer for the masked loads/stores in the loop
mov rdi, QWORD PTR [rcx+224]
|
||||
; first masked offset: low bits of r8 limited to 0x1FFFF0 (16-byte aligned)
and edx, 2097136
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
; xmm5 = constant XORed into the high qword written back each iteration
movq xmm5, rax
|
||||
; xmm8 keeps a copy of the base pointer because edi is reused as scratch
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
; xmm6 keeps the context pointer because rcx is reused as scratch
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
; xmm7 = address of the byte lookup table
movq xmm7, rax
|
||||
|
||||
; loop counter: 262144 = 0x40000 iterations
mov eax, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_fast_mainloop_soft_aes_sandybridge:
|
||||
; park the loop counter in xmm9; rax is used as scratch below
movq xmm9, rax
|
||||
; r12 = dword tables; four consecutive 1024-byte tables are indexed below
; (r12, r12+1024, then after "add r12, 2048" index and index+256)
mov r12, QWORD PTR [rcx+272]
|
||||
; load the four dwords of the current 16-byte block
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
; restore rdi = base pointer (edi held a table result above)
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
; rax = address of the byte lookup table
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
; xmm3 = the four combined table dwords, then XORed with r13:r8
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
; second masked offset, taken from the low dword of the new block
and r10d, 2097136
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
; byte 11 of the value just stored is replaced by table[byte 11]
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
; rdx:rax = rbx * rcx (unsigned 64x64 -> 128-bit product)
mul rcx
|
||||
; restore rcx = context pointer for the next iteration
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
; bring the loop counter back from xmm9
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
; next masked offset, also spilled to [rsp+64]
and edx, 2097136
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_fast_mainloop_soft_aes_sandybridge
|
||||
|
||||
; restore the saved xmm registers, release the locals, pop in reverse order
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,180 +0,0 @@
|
|||
; Loop body labelled cnv2_main_loop_fast2_bulldozer. Each iteration performs
; one aesenc round, shuffles/adds three neighbouring 16-byte chunks, one
; 64-by-32-bit division, one sqrtsd and one 64x64-bit multiply.
; rcx holds a context pointer on entry (copied to r9). Offsets read from it:
; 0..88 (qwords XORed pairwise), 96, 104 and 224 (base pointer for all
; masked memory accesses).
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 2097136 (0x1FFFF0).
; rbx, rbp and rsi are saved in stack slots above the return address.
; NOTE(review): this looks like the Microsoft x64 home area - confirm.
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
; save the current MXCSR at [rsp] and load 24448 (0x5F80) instead:
; the usual 0x1F80 value with bit 14 (rounding control) set
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
; loop counter: 262144 = 0x40000 iterations
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rbx = base pointer for every masked memory access in the loop
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
; first masked offset
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
; xmm8 = 0, copied into xmm1 before each sqrtsd
xorps xmm8, xmm8
|
||||
; xmm7 = 1023 << 52; only the low 12 bits of rax survive the shift, so the
; stale upper bits of rax do not matter
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_fast2_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
; xmm6 = r11:r8, the round key for aesenc below
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
; offsets of the three other 16-byte chunks: offset ^ 16, ^ 32 and ^ 48
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
; the three chunks are written back rotated relative to where they were read
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
; rdi = 1023 << 52, added to the sqrt input below
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
; second masked offset, from the low qword of the aesenc result
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
; -2147483647 = 0x80000001: forces bit 31 and bit 0 of the 32-bit divisor
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
; divide 0:rax (high qword of the aesenc result) by r9
div r9
|
||||
; r15 = (remainder << 32) + low 32 bits of the quotient
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
; 524287 = 0x7FFFF: if the low 19 bits of the result are all zero, take the
; fix-up path; otherwise the result is simply shifted right by 19
test rdi, 524287
|
||||
je sqrt_fixup_fast2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_fast2_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
; rdx:rax = rsi * r14 (unsigned 64x64 -> 128-bit product)
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
; masked offset for the next iteration
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_fast2_bulldozer
|
||||
|
||||
; restore the MXCSR value saved on entry
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
; r11 = rsp+64 (top of the locals); the slots written on entry are at
; r11+56, r11+64 and r11+72 because five registers were pushed in between
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
; skip over the out-of-line fix-up code that follows
jmp cnv2_main_loop_fast2_bulldozer_endp
|
||||
|
||||
; Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
; It derives rdi from (result - 1) >> 19 and conditionally adds one through
; the carry of "sub rcx, r9", where r9 = r14 + r15.
sqrt_fixup_fast2_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_fast2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_fast2_bulldozer_endp:
|
|
@ -1,183 +0,0 @@
|
|||
; Loop body labelled $main_loop_fast2_ryzen. Each iteration performs one
; aesenc round, shuffles/adds three neighbouring 16-byte chunks, one
; 64-by-32-bit division, one sqrtsd and one 64x64-bit multiply.
; rcx holds a context pointer on entry (copied to r9). Offsets read from it:
; 0..88 (qwords XORed pairwise), 96, 104 and 224 (base pointer for all
; masked memory accesses).
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 2097136 (0x1FFFF0).
; rbx, rbp and rsi are saved in stack slots above the return address.
; NOTE(review): this looks like the Microsoft x64 home area - confirm.
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
; save the current MXCSR at [rsp] and load 24448 (0x5F80) instead:
; the usual 0x1F80 value with bit 14 (rounding control) set
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
; loop counter: 262144 = 0x40000 iterations
mov ebp, 262144
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
; rbx = base pointer for every masked memory access in the loop
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
; first masked offset
and r10d, 2097136
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
; xmm8 = 0, copied into xmm1 inside the loop
xorps xmm8, xmm8
|
||||
; xmm7 = 1023 << 52; only the low 12 bits of rax survive the shift, so the
; stale upper bits of rax do not matter
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_fast2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
; xmm6 = r11:r8, the round key for aesenc below
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
; offsets of the three other 16-byte chunks: offset ^ 16, ^ 32 and ^ 48
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
; the three chunks are written back rotated relative to where they were read
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
; second masked offset, from the low qword of the aesenc result
and r10d, 2097136
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
; -2147483647 = 0x80000001: forces bit 31 and bit 0 of the 32-bit divisor
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
; rax = high qword of the aesenc result (the dividend)
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
; low qword of xmm0 = low 32 bits of the quotient, with the low 32 bits of
; the remainder in its upper half; that value is also kept in r15
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
; xmm2 keeps the unshifted sum for the fix-up path
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
; 524287 = 0x7FFFF: if the low 19 bits of the result are all zero, take the
; fix-up path; otherwise the result is simply shifted right by 19
test rdi, 524287
|
||||
je $sqrt_fixup_fast2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_fast2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
; rdx:rax = rsi * r14 (unsigned 64x64 -> 128-bit product)
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
; masked offset for the next iteration
and r10d, 2097136
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_fast2_ryzen
|
||||
|
||||
; restore the MXCSR value saved on entry
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
; r11 = rsp+64 (top of the locals); the slots written on entry are at
; r11+56, r11+64 and r11+72 because five registers were pushed in between
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
; skip over the out-of-line fix-up code that follows
jmp $cnv2_main_loop_fast2_ryzen_endp
|
||||
|
||||
; Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
; It derives rdi from (result - 1) >> 19 and conditionally adds one through
; the carry of "sub rcx, r9", where r9 is the unshifted sum kept in xmm2.
$sqrt_fixup_fast2_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_fast2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ryzen_endp:
|
|
@ -1,271 +0,0 @@
|
|||
; Loop body labelled cnv2_mainloop_soft_aes_fast2_sandybridge. It transforms
; one 16-byte block per iteration with dword table lookups (no AES
; instruction), shuffles/adds three neighbouring chunks, and performs one
; division, one sqrtsd and one 64x64-bit multiply.
; rcx holds a context pointer on entry (copied to r10). Offsets read from it:
; 0..88 (qwords XORed pairwise), 96, 104, 224 (base pointer for all masked
; memory accesses) and 272 (dword lookup tables).
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 2097136 (0x1FFFF0).
; Stack layout after the prologue (8 pushes + 152 bytes of locals):
;   [rsp+224] = context pointer stored at entry [rsp+8]
;   [rsp+240], [rsp+248] = entry [rsp+24] and [rsp+32], used as spill slots
; save the context pointer in the slot just above the return address
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
; save the current MXCSR at [rsp+4] and load 24448 (0x5F80) instead:
; the usual 0x1F80 value with bit 14 (rounding control) set
stmxcsr DWORD PTR [rsp+4]
|
||||
mov DWORD PTR [rsp], 24448
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
; r11 = base pointer for every masked memory access in the loop
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
; xmm8 = 1023 << 52; only the low 12 bits of rax survive the shift, so the
; stale upper bits of rax do not matter
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
; first masked offset
and eax, 2097136
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
; xmm9 = 0, copied into xmm1 before each sqrtsd
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
; xmm12 keeps a copy of the base pointer because r11d is reused as scratch
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
; loop counter: 262144 = 0x40000 iterations
mov r12d, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge:
|
||||
; park the loop counter in xmm11; r12 becomes the table base
movd xmm11, r12d
|
||||
; r12 = dword tables; four consecutive 1024-byte tables are indexed below
; (r12, r12+1024, then after "add r12, 2048" index and index+256)
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
; load the four dwords of the current 16-byte block
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
; xmm7 = r9:r8, XORed into the table result below
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
; restore r11 = base pointer (r11d held a table result above)
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
; rcx, rax, rdx = offsets of the three other chunks: offset ^ 16, ^ 32, ^ 48
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
; xmm6 = the four combined table dwords XORed with r9:r8
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
; the three chunks are written back rotated relative to where they were read
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
; second masked offset, from the low qword of the new block
and r10d, 2097136
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
; -2147483647 = 0x80000001; the 32-bit move zero-extends into rcx, so the
; "or" below forces bit 31 and bit 0 of the divisor in r9
mov ecx, -2147483647
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
or r9, rcx
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
; rax = high qword of the new block (the dividend)
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
; rdx = (remainder << 32) + low 32 bits of the quotient; kept in xmm10
shl rdx, 32
|
||||
mov eax, eax
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi]
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
; 524287 = 0x7FFFF: if the low 19 bits of the result are all zero, take the
; fix-up path; otherwise the result is simply shifted right by 19
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_fast2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
; xmm13 carries the sqrt-derived value into the next iteration
movdqa xmm13, xmm1
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
; rdx:rax = rbx * rdi (unsigned 64x64 -> 128-bit product)
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4
|
||||
; reload the value spilled at [rsp+240] (r9 was used as scratch above)
mov r9, QWORD PTR [rsp+240]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
; reload the context pointer saved at entry [rsp+8]
mov r10, QWORD PTR [rsp+224]
|
||||
; bring the loop counter back from xmm11
movd r12d, xmm11
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
; masked offset for the next iteration, also spilled to [rsp+248]
and eax, 2097136
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_fast2_sandybridge
|
||||
|
||||
; restore the MXCSR value saved on entry, then xmm6-xmm13
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
; skip over the out-of-line fix-up code that follows
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp
|
||||
|
||||
; Out-of-line path taken when the low 19 bits of the sqrtsd result are zero.
; It derives rdx from (result - 1) >> 19 and conditionally adds one through
; the carry of "sub rcx, r9" (r9 = unshifted sqrt input), then returns the
; value in xmm1.
sqrt_fixup_soft_aes_fast2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:
|
|
@ -1,74 +0,0 @@
|
|||
; Loop body labelled cn_liteupx_mainloop_sandybridge: one aesenc round, a
; byte-table substitution on byte 11 of the stored block, and one 64x64-bit
; multiply per iteration.
; rcx holds a context pointer on entry. Offsets read from it: 0..56 and 192
; (qwords XORed together), 224 (base pointer for all masked memory accesses),
; 256 (pointer dereferenced at +35) and 264 (byte lookup table).
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 131072 (0x20000). Address mask: 1048560 (0xFFFF0).
; rbx, rbp, rsi and rdi are saved in the four stack slots above the return
; address. NOTE(review): looks like the Microsoft x64 home area - confirm.
; The fragment contains no ret; presumably the including file supplies it.
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
; loop counter: 131072 = 0x20000 iterations
mov ebp, 131072
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
; r15 = address of the byte lookup table
mov r15, QWORD PTR [rcx+264]
|
||||
; first masked offset: low bits of r8 limited to 0xFFFF0 (16-byte aligned)
and edx, 1048560
|
||||
; r14 = constant XORed into the high qword written back each iteration
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
; rsi = base pointer for every masked memory access in the loop
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_liteupx_mainloop_sandybridge:
|
||||
; xmm1 = rdi:r8, the round key for aesenc below
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
; r9 = address of the second block: base + (low dword of result & 0xFFFF0)
and r9d, 1048560
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
; byte 11 of the value just stored is replaced by table[byte 11]
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
; rdx:rax = rbx * r10 (unsigned 64x64 -> 128-bit product)
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
; masked offset for the next iteration; its block is preloaded into xmm2
and edx, 1048560
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_liteupx_mainloop_sandybridge
|
||||
|
||||
; the slots written on entry are 16 bytes higher now because r14 and r15
; are still on the stack
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
|
@ -1,166 +0,0 @@
|
|||
; Loop body labelled cn_liteupx_mainloop_soft_aes_sandybridge: one 16-byte
; block per iteration is transformed with dword table lookups instead of an
; AES instruction, followed by a 64x64-bit multiply and a write-back.
; rcx holds a context pointer on entry. Offsets read from it in this fragment:
;   0..56 and 192  qword values XORed together to seed the loop state
;   224            base pointer for every masked memory access
;   256            pointer that is dereferenced at +35
;   264            byte lookup table
;   272            dword lookup tables
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 131072 (0x20000). Address mask: 1048560 (0xFFFF0).
; The fragment contains no ret; presumably the including file supplies it.
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
; 72 bytes of locals: 4 x 16 bytes for xmm6-xmm9 plus one qword at [rsp+64]
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
; rdi = base pointer for the masked loads/stores in the loop
mov rdi, QWORD PTR [rcx+224]
|
||||
; first masked offset: low bits of r8 limited to 0xFFFF0 (16-byte aligned)
and edx, 1048560
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
; xmm5 = constant XORed into the high qword written back each iteration
movq xmm5, rax
|
||||
; xmm8 keeps a copy of the base pointer because edi is reused as scratch
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
; xmm6 keeps the context pointer because rcx is reused as scratch
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
; xmm7 = address of the byte lookup table
movq xmm7, rax
|
||||
|
||||
; loop counter: 131072 = 0x20000 iterations
mov eax, 131072
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge:
|
||||
; park the loop counter in xmm9; rax is used as scratch below
movq xmm9, rax
|
||||
; r12 = dword tables; four consecutive 1024-byte tables are indexed below
; (r12, r12+1024, then after "add r12, 2048" index and index+256)
mov r12, QWORD PTR [rcx+272]
|
||||
; load the four dwords of the current 16-byte block
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
; restore rdi = base pointer (edi held a table result above)
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
; rax = address of the byte lookup table
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
; xmm3 = the four combined table dwords, then XORed with r13:r8
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
; second masked offset, taken from the low dword of the new block
and r10d, 1048560
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
; byte 11 of the value just stored is replaced by table[byte 11]
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
; rdx:rax = rbx * rcx (unsigned 64x64 -> 128-bit product)
mul rcx
|
||||
; restore rcx = context pointer for the next iteration
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
; bring the loop counter back from xmm9
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
; next masked offset, also spilled to [rsp+64]
and edx, 1048560
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_liteupx_mainloop_soft_aes_sandybridge
|
||||
|
||||
; restore the saved xmm registers, release the locals, pop in reverse order
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,74 +0,0 @@
|
|||
; Loop body labelled cn_litev1_mainloop_sandybridge: one aesenc round, a
; byte-table substitution on byte 11 of the stored block, and one 64x64-bit
; multiply per iteration.
; rcx holds a context pointer on entry. Offsets read from it: 0..56 and 192
; (qwords XORed together), 224 (base pointer for all masked memory accesses),
; 256 (pointer dereferenced at +35) and 264 (byte lookup table).
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 1048560 (0xFFFF0).
; rbx, rbp, rsi and rdi are saved in the four stack slots above the return
; address. NOTE(review): looks like the Microsoft x64 home area - confirm.
; The fragment contains no ret; presumably the including file supplies it.
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
; loop counter: 262144 = 0x40000 iterations
mov ebp, 262144
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
; r15 = address of the byte lookup table
mov r15, QWORD PTR [rcx+264]
|
||||
; first masked offset: low bits of r8 limited to 0xFFFF0 (16-byte aligned)
and edx, 1048560
|
||||
; r14 = constant XORed into the high qword written back each iteration
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
; rsi = base pointer for every masked memory access in the loop
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_litev1_mainloop_sandybridge:
|
||||
; xmm1 = rdi:r8, the round key for aesenc below
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
; r9 = address of the second block: base + (low dword of result & 0xFFFF0)
and r9d, 1048560
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
; byte 11 of the value just stored is replaced by table[byte 11]
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
; rdx:rax = rbx * r10 (unsigned 64x64 -> 128-bit product)
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
; masked offset for the next iteration; its block is preloaded into xmm2
and edx, 1048560
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_litev1_mainloop_sandybridge
|
||||
|
||||
; the slots written on entry are 16 bytes higher now because r14 and r15
; are still on the stack
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
|
@ -1,166 +0,0 @@
|
|||
; Loop body labelled cn_litev1_mainloop_soft_aes_sandybridge: one 16-byte
; block per iteration is transformed with dword table lookups instead of an
; AES instruction, followed by a 64x64-bit multiply and a write-back.
; rcx holds a context pointer on entry. Offsets read from it in this fragment:
;   0..56 and 192  qword values XORed together to seed the loop state
;   224            base pointer for every masked memory access
;   256            pointer that is dereferenced at +35
;   264            byte lookup table
;   272            dword lookup tables
; NOTE(review): field meanings are inferred from usage only; confirm against
; the structure definition used by the caller.
; Iterations: 262144 (0x40000). Address mask: 1048560 (0xFFFF0).
; The fragment contains no ret; presumably the including file supplies it.
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
; 72 bytes of locals: 4 x 16 bytes for xmm6-xmm9 plus one qword at [rsp+64]
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
; rdi = base pointer for the masked loads/stores in the loop
mov rdi, QWORD PTR [rcx+224]
|
||||
; first masked offset: low bits of r8 limited to 0xFFFF0 (16-byte aligned)
and edx, 1048560
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
; xmm5 = constant XORed into the high qword written back each iteration
movq xmm5, rax
|
||||
; xmm8 keeps a copy of the base pointer because edi is reused as scratch
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
; xmm6 keeps the context pointer because rcx is reused as scratch
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
; xmm7 = address of the byte lookup table
movq xmm7, rax
|
||||
|
||||
; loop counter: 262144 = 0x40000 iterations
mov eax, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_litev1_mainloop_soft_aes_sandybridge:
|
||||
; park the loop counter in xmm9; rax is used as scratch below
movq xmm9, rax
|
||||
; r12 = dword tables; four consecutive 1024-byte tables are indexed below
; (r12, r12+1024, then after "add r12, 2048" index and index+256)
mov r12, QWORD PTR [rcx+272]
|
||||
; load the four dwords of the current 16-byte block
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
; restore rdi = base pointer (edi held a table result above)
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
; rax = address of the byte lookup table
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
; xmm3 = the four combined table dwords, then XORed with r13:r8
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
; second masked offset, taken from the low dword of the new block
and r10d, 1048560
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
; byte 11 of the value just stored is replaced by table[byte 11]
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
; rdx:rax = rbx * rcx (unsigned 64x64 -> 128-bit product)
mul rcx
|
||||
; restore rcx = context pointer for the next iteration
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
; bring the loop counter back from xmm9
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
; next masked offset, also spilled to [rsp+64]
and edx, 1048560
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_litev1_mainloop_soft_aes_sandybridge
|
||||
|
||||
; restore the saved xmm registers, release the locals, pop in reverse order
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -7,40 +7,44 @@
|
|||
# define FN_PREFIX(fn) fn
|
||||
.section .text
|
||||
#endif
|
||||
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm)
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv1_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv1_mainloop_sandybridge.inc"
|
||||
#include "cnv1_main_loop_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -49,10 +53,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_litev1_mainloop_sandybridge.inc"
|
||||
#include "cnv1_main_loop_lite_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -61,10 +65,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fast_mainloop_sandybridge.inc"
|
||||
#include "cnv1_main_loop_fast_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -73,7 +77,19 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
/* Exported wrapper (declared with .global near the top of this file).
   Reserves 48 bytes of stack, copies rdi into rcx, runs the included loop
   body inline, then releases the 48 bytes and returns.
   NOTE(review): presumably rcx is the register the included body reads its
   argument from -- that generated .inc is not visible here; confirm. */
FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv1_main_loop_upx_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_main_loop_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_ivybridge.inc"
|
||||
|
@ -85,7 +101,7 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||
FN_PREFIX(cnv2_main_loop_ryzen_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_ryzen.inc"
|
||||
|
@ -97,7 +113,7 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
FN_PREFIX(cnv2_main_loop_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_bulldozer.inc"
|
||||
|
@ -109,7 +125,7 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv2_double_main_loop_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
|
@ -122,10 +138,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm):
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_ivybridge.inc"
|
||||
#include "cnv2_main_loop_fastv2_ivybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -134,10 +150,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_ryzen_asm):
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_ryzen.inc"
|
||||
#include "cnv2_main_loop_fastv2_ryzen.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -146,10 +162,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm):
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_main_loop_bulldozer.inc"
|
||||
#include "cnv2_main_loop_fastv2_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -158,11 +174,11 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
#include "cn_fastv2_double_main_loop_sandybridge.inc"
|
||||
#include "cnv2_double_main_loop_fastv2_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -171,10 +187,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_liteupx_mainloop_sandybridge.inc"
|
||||
#include "cnv2_main_loop_ultralite_ivybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -183,10 +199,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm):
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_ultralitev2_main_loop_ivybridge.inc"
|
||||
#include "cnv2_main_loop_ultralite_ryzen.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -195,11 +211,23 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm):
|
||||
/* Exported wrapper (declared with .global near the top of this file).
   Reserves 48 bytes of stack, copies rdi into rcx, runs the included loop
   body inline, then releases the 48 bytes and returns.
   NOTE(review): presumably rcx is the register the included body reads its
   argument from -- that generated .inc is not visible here; confirm. */
FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_ultralite_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
mov rdx, rsi
|
||||
#include "cn_ultralitev2_double_main_loop_sandybridge.inc"
|
||||
#include "cnv2_double_main_loop_ultralite_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -208,10 +236,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm):
|
||||
FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_ultralitev2_main_loop_ryzen.inc"
|
||||
#include "cnv1_main_loop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -220,34 +248,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_ultralitev2_main_loop_bulldozer.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv1_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_litev1_mainloop_soft_aes_sandybridge.inc"
|
||||
#include "cnv1_main_loop_lite_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -256,10 +260,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fast_mainloop_soft_aes_sandybridge.inc"
|
||||
#include "cnv1_main_loop_fast_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -268,34 +272,10 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_fastv2_mainloop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||
#include "cnv1_main_loop_upx_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
@ -304,9 +284,34 @@ ALIGN 16
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
|
||||
/* Exported wrapper (declared with .global near the top of this file).
   Reserves 48 bytes of stack, copies rdi into rcx, runs the included loop
   body inline, then releases the 48 bytes and returns.
   NOTE(review): presumably rcx is the register the included body reads its
   argument from -- that generated .inc is not visible here; confirm. */
FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
/* Exported wrapper (declared with .global near the top of this file).
   Reserves 48 bytes of stack, copies rdi into rcx, runs the included loop
   body inline, then releases the 48 bytes and returns.
   NOTE(review): presumably rcx is the register the included body reads its
   argument from -- that generated .inc is not visible here; confirm. */
FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cnv2_main_loop_fastv2_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
||||
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm):
|
||||
sub rsp, 48
|
||||
mov rcx, rdi
|
||||
#include "cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
|
||||
#include "cnv2_main_loop_ultralite_soft_aes_sandybridge.inc"
|
||||
add rsp, 48
|
||||
ret 0
|
|
@ -1,414 +0,0 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 65536
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 131056
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 131056
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
main_loop_double_ultralitev2_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 131056
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 131056
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 131056
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_ultralitev2_sandybridge
|
||||
div_fix_1_ret_ultralitev2_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_ultralitev2_sandybridge
|
||||
div_fix_2_ret_ultralitev2_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_ultralitev2_sandybridge
|
||||
sqrt_fix_1_ret_ultralitev2_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_ultralitev2_sandybridge
|
||||
sqrt_fix_2_ret_ultralitev2_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 131056
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_ultralitev2_sandybridge
|
||||
|
||||
/* Epilogue of the double-hash loop body: undoes the prologue of this file.
   Reloads MXCSR from the slot written by the prologue's stmxcsr, restores
   xmm6-xmm15 from their save slots, drops the 184-byte frame and pops the
   eight general-purpose registers in the reverse of the prologue push order. */
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
/* r11 = stack pointer value after the frame is released; xmm6-xmm12 are
   addressed relative to it (r11-24 == rsp+160, ..., r11-120 == rsp+64). */
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
/* Release the 184-byte frame. */
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
/* Skip over the out-of-line fix-up routines that follow. */
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
|
||||
|
||||
/* Out-of-line fix-up for the first division lane.
   Reached through `js` right after the main loop computes
   r11 -= rbx * rdx, i.e. when that subtraction left r11 negative.
   Steps back by one: rbx -= 1 and r11 += rdx, then resumes the main loop. */
div_fix_1_ultralitev2_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_ultralitev2_sandybridge
|
||||
|
||||
/* Out-of-line fix-up for the second division lane.
   Reached through `js` right after the main loop computes
   r8 -= rdx * r9, i.e. when that subtraction left r8 negative.
   Steps back by one: rdx -= 1 and r8 += r9, then resumes the main loop. */
div_fix_2_ultralitev2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_ultralitev2_sandybridge
|
||||
|
||||
/* Out-of-line correction for the low lane of the packed square root.
   Entered through `je` when the low 19 bits of r9 are all zero; r9 holds the
   low qword of the sqrtpd result. On entry xmm5 is that sqrtpd result with
   each qword shifted right by 19, and xmm3 is the value whose shifted and
   biased qwords were fed to sqrtpd.
   Result: the low qword of xmm5 is replaced, the high qword is preserved.
   Overwrites r8, r9, r11, rax, rdx and xmm0. */
sqrt_fix_1_ultralitev2_sandybridge:
|
||||
/* r8 = low qword of xmm3 (the square-root input for this lane). */
movq r8, xmm3
|
||||
/* Park the high qword of xmm5 in the low half of xmm0 for re-assembly below. */
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
/* r11 = 0xFFFFFC02 << 32 (the 32-bit move zero-extends before the shift). */
mov r11d, -1022
|
||||
shl r11, 32
|
||||
/* With t = r9 - 1: r9 = t >> 19, rax = t >> 20. */
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
/* rdx = (r9 - rax + r11 + 1) * (rax + r11), low 64 bits of the product. */
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
/* Add 1 to r9 when the product is below r8 (borrow out of the subtraction). */
sub rdx, r8
|
||||
adc r9, 0
|
||||
/* xmm5 = { low: corrected r9, high: original high qword of xmm5 }. */
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
|
||||
|
||||
/* Out-of-line correction for the high lane of the packed square root.
   Entered through `je` when the low 19 bits of r8 are all zero; r8 holds the
   high qword of the sqrtpd result.
   Result: the high qword of xmm5 is replaced, the low qword is preserved.
   Shifts xmm3 in place and overwrites r8, r11, rax, rbx, rdx and xmm0. */
sqrt_fix_2_ultralitev2_sandybridge:
|
||||
/* r11 = high qword of xmm3 (the square-root input for this lane). */
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
/* rbx = 0xFFFFFC02 << 32 (the 32-bit move zero-extends before the shift). */
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
/* With t = r8 - 1: r8 = t >> 19, rax = t >> 20. */
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
/* rdx = (r8 - rax + rbx + 1) * (rax + rbx), low 64 bits of the product. */
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
/* Add 1 to r8 when the product is below r11 (borrow out of the subtraction). */
sub rdx, r11
|
||||
adc r8, 0
|
||||
/* xmm5 = { low: unchanged, high: corrected r8 }. */
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:
|
|
@ -1,180 +0,0 @@
|
|||
/* Prologue of the single-hash loop body (bulldozer variant).
   Saves the registers the loop overwrites, switches MXCSR, and loads the
   loop state from the block addressed by rcx.
   rbx, rbp and rsi are stored above the incoming stack pointer rather than
   pushed; the code therefore relies on 16..39 bytes above rsp being
   writable on entry. */
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
/* Save the current MXCSR at [rsp] and load 24448 (0x5F80) instead.
   NOTE(review): 0x5F80 is the default 0x1F80 with bit 14 set, which per the
   Intel SDM selects round-up -- confirm against the reference algorithm. */
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
/* State loads. r9 keeps a copy of the rcx base because rcx itself is
   reused as a scratch register further down.
     xmm3 = { [48]^[16], [56]^[24] }
     xmm4 = { [80]^[64], [88]^[72] }
     r8   = [32]^[0],  r11 = [40]^[8]
     rbx  = [224] (base address used for every memory access in the loop)
     rdi  = [104], r15 = [96]
     ebp  = 65536 (loop counter)
     r10  = r8 & 131056 (0x1FFF0, first offset from rbx) */
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 131056
|
||||
/* Save xmm6-xmm8 in the 64-byte frame before they are overwritten. */
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
/* xmm8 = 0; the loop copies it into xmm1 before each sqrtsd. */
xorps xmm8, xmm8
|
||||
/* xmm7 = 1023 << 52. Only the low 16 bits of rax are written here; the
   stale upper bits are discarded by the 52-bit shift. */
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
/* Main loop of the single-hash body (bulldozer variant); runs until the
   counter in ebp, set to 65536 by the prologue, reaches zero.
   Loop-carried state: r8/r11 (pair combined into xmm6), r10 (offset from
   rbx, always masked with 131056 = 0x1FFF0), xmm3 and xmm4 (results of the
   previous two AES rounds), rdi and r15 (square-root and division results
   of the previous iteration). rbx is the base of every memory access. */
ALIGN 16
|
||||
cnv2_main_loop_ultralitev2_bulldozer:
|
||||
/* xmm5 = 16 bytes at rbx+r10; xmm6 = { r8, r11 } is the AES round key. */
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
/* rdx remembers the address just read; it is written back further down. */
lea rdx, QWORD PTR [r10+rbx]
|
||||
/* r9 = 2 * rdi (feeds the divisor), rdi <<= 32 (feeds the xor into rsi). */
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
/* The three neighbouring 16-byte slots are r10^16, r10^32 and r10^48. */
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
/* Load the three neighbours, add xmm3 / xmm6 / xmm4 with paddq, and store
   them back rotated: ^48 result to ^16, ^16 result to ^32, ^32 result to ^48. */
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
/* xmm1 = 0 (copy of the zeroed xmm8) ahead of the sqrtsd below. */
movaps xmm1, xmm8
|
||||
/* rsi = r15 ^ (previous rdi << 32). */
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
/* rdi = 1023 << 52, the bias added to the square-root input. */
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
/* r14 = low qword of the AES result, rax = high qword (the dividend). */
movq r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
/* Write (AES result ^ xmm3) back to the slot that was read at the top, and
   derive the second offset r10 = r14 & 0x1FFF0. */
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 131056
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
/* rsi ^= qword at the second slot; r12 keeps its address and r13 its upper
   qword for the write-back at the bottom of the loop. */
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
/* Divisor r9 = (uint32)(2 * old rdi + r14) | 0x80000001; the 32-bit ops
   clear the upper half of r9. rdx is zeroed, so `div` computes rax / r9. */
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
/* r15 = (remainder << 32) + low 32 bits of the quotient. */
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
/* Square-root input: ((r14 + r15) >> 12) + (1023 << 52), taken as a double. */
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movq xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
/* If the low 19 bits of the raw result are zero, take the out-of-line
   fix-up, which performs its own shift; otherwise rdi >>= 19 here. */
test rdi, 524287
|
||||
je sqrt_fixup_ultralitev2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_ultralitev2_bulldozer_ret:
|
||||
/* rdx:rax = rsi * r14 (unsigned 64x64 -> 128); xmm0 = { rdx, rax }. */
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
/* Neighbours of the second slot: r9 = r10^16, rcx = r10^32, r10 = r10^48. */
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
/* Fold the ^32 neighbour into the product halves kept in rdx / rax. */
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
/* The ^16 neighbour is xored with the product before the add. */
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
/* Rotated store: ^48 result to ^16, ^16 result to ^32, ^32 result to ^48. */
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
/* Age the AES history: xmm4 takes the old xmm3 (xmm3 = xmm5 follows below). */
movdqa xmm4, xmm3
|
||||
/* r8:r11 += product; store the sums at the second slot, then xor in rsi
   and the slot's original upper qword (r13) to form the next r8 / r11. */
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
/* Next iteration's first offset: r8 & 0x1FFF0. */
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 131056
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_ultralitev2_bulldozer
|
||||
|
||||
/* Epilogue of the bulldozer loop body: undoes the prologue of this file.
   Reloads the MXCSR saved at [rsp], restores xmm6-xmm8 and the rbx/rbp/rsi
   values stored above the entry stack pointer, releases the 64-byte frame
   and pops the five pushed registers in reverse order. */
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
/* r11 = stack pointer after the 64-byte frame is released. The five pushes
   sit above it, so r11+56/+64/+72 are the entry-time rsp+16/+24/+32 slots. */
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
/* r11-48 == rsp+16, the xmm8 save slot. */
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
/* Skip over the out-of-line square-root fix-up that follows. */
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
|
||||
|
||||
/* Out-of-line square-root correction.
   Entered through `je` when the low 19 bits of rdi (the raw sqrtsd result)
   are all zero. Produces the final rdi itself -- including the shift by 19
   that the fall-through path performs -- and resumes after that shift.
   Overwrites r9, rax, rcx and rdx. */
sqrt_fixup_ultralitev2_bulldozer:
|
||||
/* r9 = low qword of xmm5 + r15: the square-root input before shift and bias. */
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
/* rdx = 0xFFFFFC02 << 32 (the 32-bit move zero-extends before the shift). */
mov edx, -1022
|
||||
shl rdx, 32
|
||||
/* With t = rdi - 1: rdi = t >> 19, rax = t >> 20. */
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
/* rcx = (rdi - rax + rdx + 1) * (rax + rdx), low 64 bits of the product. */
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
/* Add 1 to rdi when the product is below r9 (borrow out of the subtraction). */
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_ultralitev2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_ultralitev2_bulldozer_endp:
|
|
@ -1,186 +0,0 @@
|
|||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 131056
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_ultralitev2_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 131056
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_ultralitev2_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_ultralitev2_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 131056
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_ultralitev2_ivybridge
|
||||
|
||||
/* Epilogue of the ivybridge loop body: undoes the prologue of this file.
   Reloads the MXCSR saved at [rsp], restores rbx and xmm6-xmm8, releases
   the 80-byte frame and pops the seven pushed registers in reverse order. */
ldmxcsr DWORD PTR [rsp]
|
||||
/* rsp+160 is the entry-time rsp+24 slot (80-byte frame + seven pushes). */
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
/* Skip over the out-of-line square-root fix-up that follows. */
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
|
||||
|
||||
/* Out-of-line square-root correction.
   Entered through `je` when the low 19 bits of rdx (the raw sqrtsd result)
   are all zero; r9 still holds the square-root input before shift and bias.
   Leaves the corrected, already shifted result in xmm3 and resumes after the
   shift on the fall-through path. Borrows r13 as scratch and reloads its
   loop constant before returning. Overwrites rax, rcx and rdx. */
$sqrt_fixup_ultralitev2_ivybridge:
|
||||
dec rdx
|
||||
/* r13 = 0xFFFFFC02 << 32 (the 32-bit move zero-extends before the shift). */
mov r13d, -1022
|
||||
shl r13, 32
|
||||
/* With t = rdx - 1: rdx = t >> 19, rax = t >> 20. */
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
/* Subtracting ~r13 equals adding r13 + 1, so rcx = rdx - (t >> 20) + r13 + 1. */
not r13
|
||||
sub rcx, r13
|
||||
/* Put back the constant the main loop ORs into the divisor. */
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
/* Add 1 to rdx when the product is below r9 (borrow out of the subtraction). */
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_ultralitev2_ivybridge_endp:
|
|
@ -1,183 +0,0 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 131056
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_ultralitev2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 131056
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_ultralitev2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_ultralitev2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 131056
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_ultralitev2_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
|
||||
|
||||
$sqrt_fixup_ultralitev2_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_ultralitev2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_ultralitev2_ryzen_endp:
|
|
@ -1,271 +0,0 @@
|
|||
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
stmxcsr DWORD PTR [rsp+4]
|
||||
mov DWORD PTR [rsp], 24448
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 131056
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 262144
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 131056
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
mov ecx, -2147483647
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
or r9, rcx
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
shl rdx, 32
|
||||
mov eax, eax
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi]
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+240]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
mov r10, QWORD PTR [rsp+224]
|
||||
movd r12d, xmm11
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 131056
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
|
||||
|
||||
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:
|
|
@ -5,7 +5,7 @@
|
|||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 262144
|
||||
mov ebp, ${ITERATIONS}
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
|
@ -18,7 +18,7 @@
|
|||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
|
@ -30,14 +30,14 @@
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cn_fast_mainloop_sandybridge:
|
||||
cnv1_main_loop_${ALGO}_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
and r9d, ${MASK}
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
|
@ -60,11 +60,11 @@ cn_fast_mainloop_sandybridge:
|
|||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_fast_mainloop_sandybridge
|
||||
jne cnv1_main_loop_${ALGO}_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
|
@ -26,7 +26,7 @@
|
|||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
|
@ -38,14 +38,14 @@
|
|||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 524288
|
||||
mov eax, ${ITERATIONS}
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv1_mainloop_soft_aes_sandybridge:
|
||||
cnv1_main_loop_${ALGO}_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
|
@ -118,7 +118,7 @@ cnv1_mainloop_soft_aes_sandybridge:
|
|||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
|
@ -145,10 +145,10 @@ cnv1_mainloop_soft_aes_sandybridge:
|
|||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cnv1_mainloop_soft_aes_sandybridge
|
||||
jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
|
@ -1,74 +0,0 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 524288
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv1_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cnv1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
|
@ -1,414 +0,0 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 524288
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
main_loop_double_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_sandybridge
|
||||
div_fix_1_ret_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_sandybridge
|
||||
div_fix_2_ret_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_sandybridge
|
||||
sqrt_fix_1_ret_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_sandybridge
|
||||
sqrt_fix_2_ret_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_sandybridge_endp
|
||||
|
||||
div_fix_1_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_sandybridge
|
||||
|
||||
div_fix_2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_sandybridge
|
||||
|
||||
sqrt_fix_1_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_sandybridge
|
||||
|
||||
sqrt_fix_2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_sandybridge_endp:
|
|
@ -18,7 +18,7 @@
|
|||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 262144
|
||||
mov r14d, 524288
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
|
@ -99,7 +99,7 @@
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
main_loop_double_fast2_sandybridge:
|
||||
cnv2_double_main_loop_${ALGO}_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
|
@ -253,8 +253,8 @@ main_loop_double_fast2_sandybridge:
|
|||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_fast2_sandybridge
|
||||
div_fix_1_ret_fast2_sandybridge:
|
||||
js div_fix_1_${ALGO}_sandybridge
|
||||
div_fix_1_ret_${ALGO}_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
|
@ -262,8 +262,8 @@ div_fix_1_ret_fast2_sandybridge:
|
|||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_fast2_sandybridge
|
||||
div_fix_2_ret_fast2_sandybridge:
|
||||
js div_fix_2_${ALGO}_sandybridge
|
||||
div_fix_2_ret_${ALGO}_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
|
@ -279,15 +279,15 @@ div_fix_2_ret_fast2_sandybridge:
|
|||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_fast2_sandybridge
|
||||
sqrt_fix_1_ret_fast2_sandybridge:
|
||||
je sqrt_fix_1_${ALGO}_sandybridge
|
||||
sqrt_fix_1_ret_${ALGO}_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_fast2_sandybridge
|
||||
sqrt_fix_2_ret_fast2_sandybridge:
|
||||
je sqrt_fix_2_${ALGO}_sandybridge
|
||||
sqrt_fix_2_ret_${ALGO}_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
|
@ -335,7 +335,7 @@ sqrt_fix_2_ret_fast2_sandybridge:
|
|||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_fast2_sandybridge
|
||||
jne cnv2_double_main_loop_${ALGO}_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
|
@ -358,19 +358,19 @@ sqrt_fix_2_ret_fast2_sandybridge:
|
|||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp
|
||||
jmp cnv2_double_main_loop_${ALGO}_sandybridge_endp
|
||||
|
||||
div_fix_1_fast2_sandybridge:
|
||||
div_fix_1_${ALGO}_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_fast2_sandybridge
|
||||
jmp div_fix_1_ret_${ALGO}_sandybridge
|
||||
|
||||
div_fix_2_fast2_sandybridge:
|
||||
div_fix_2_${ALGO}_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_fast2_sandybridge
|
||||
jmp div_fix_2_ret_${ALGO}_sandybridge
|
||||
|
||||
sqrt_fix_1_fast2_sandybridge:
|
||||
sqrt_fix_1_${ALGO}_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
|
@ -389,9 +389,9 @@ sqrt_fix_1_fast2_sandybridge:
|
|||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_fast2_sandybridge
|
||||
jmp sqrt_fix_1_ret_${ALGO}_sandybridge
|
||||
|
||||
sqrt_fix_2_fast2_sandybridge:
|
||||
sqrt_fix_2_${ALGO}_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
|
@ -409,6 +409,6 @@ sqrt_fix_2_fast2_sandybridge:
|
|||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_fast2_sandybridge
|
||||
jmp sqrt_fix_2_ret_${ALGO}_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_fast2_sandybridge_endp:
|
||||
cnv2_double_main_loop_${ALGO}_sandybridge_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov ebp, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
|
@ -31,7 +31,7 @@
|
|||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
|
@ -46,7 +46,7 @@
|
|||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_bulldozer:
|
||||
cnv2_main_loop_${ALGO}_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
|
@ -83,7 +83,7 @@ cnv2_main_loop_bulldozer:
|
|||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
|
@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer:
|
|||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
je sqrt_fixup_${ALGO}_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
sqrt_fixup_${ALGO}_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
|
@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret:
|
|||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
jne cnv2_main_loop_${ALGO}_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
|
@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret:
|
|||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
jmp cnv2_main_loop_${ALGO}_bulldozer_endp
|
||||
|
||||
sqrt_fixup_bulldozer:
|
||||
sqrt_fixup_${ALGO}_bulldozer:
|
||||
movq r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
|
@ -175,6 +175,6 @@ sqrt_fixup_bulldozer:
|
|||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
jmp sqrt_fixup_${ALGO}_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
||||
cnv2_main_loop_${ALGO}_bulldozer_endp:
|
|
@ -1,186 +0,0 @@
|
|||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 524288
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_ivybridge_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 262144
|
||||
mov esi, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
|
@ -35,7 +35,7 @@
|
|||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
|
@ -52,7 +52,7 @@
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_fast2_ivybridge:
|
||||
cnv2_main_loop_${ALGO}_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
|
@ -66,7 +66,7 @@ $main_loop_fast2_ivybridge:
|
|||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
and r9d, ${MASK}
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
|
@ -109,9 +109,9 @@ $main_loop_fast2_ivybridge:
|
|||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_fast2_ivybridge
|
||||
je sqrt_fixup_${ALGO}_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_fast2_ivybridge_ret:
|
||||
sqrt_fixup_${ALGO}_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
|
@ -122,7 +122,7 @@ $sqrt_fixup_fast2_ivybridge_ret:
|
|||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
and edi, ${MASK}
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
|
@ -147,7 +147,7 @@ $sqrt_fixup_fast2_ivybridge_ret:
|
|||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_fast2_ivybridge
|
||||
jne cnv2_main_loop_${ALGO}_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
|
@ -162,9 +162,9 @@ $sqrt_fixup_fast2_ivybridge_ret:
|
|||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_fast2_ivybridge_endp
|
||||
jmp cnv2_main_loop_${ALGO}_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_fast2_ivybridge:
|
||||
sqrt_fixup_${ALGO}_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
|
@ -181,6 +181,6 @@ $sqrt_fixup_fast2_ivybridge:
|
|||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_fast2_ivybridge_ret
|
||||
jmp sqrt_fixup_${ALGO}_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ivybridge_endp:
|
||||
cnv2_main_loop_${ALGO}_ivybridge_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov ebp, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
|
@ -31,7 +31,7 @@
|
|||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
|
@ -50,7 +50,7 @@
|
|||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
$main_loop_ryzen:
|
||||
cnv2_main_loop_${ALGO}_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
|
@ -82,7 +82,7 @@ $main_loop_ryzen:
|
|||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
|
@ -107,10 +107,10 @@ $main_loop_ryzen:
|
|||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_ryzen
|
||||
je sqrt_fixup_${ALGO}_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_ryzen_ret:
|
||||
sqrt_fixup_${ALGO}_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
|
@ -142,10 +142,10 @@ $sqrt_fixup_ryzen_ret:
|
|||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_ryzen
|
||||
jne cnv2_main_loop_${ALGO}_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
|
@ -161,9 +161,9 @@ $sqrt_fixup_ryzen_ret:
|
|||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_ryzen_endp
|
||||
jmp cnv2_main_loop_${ALGO}_ryzen_endp
|
||||
|
||||
$sqrt_fixup_ryzen:
|
||||
sqrt_fixup_${ALGO}_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
|
@ -178,6 +178,6 @@ $sqrt_fixup_ryzen:
|
|||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_ryzen_ret
|
||||
jmp sqrt_fixup_${ALGO}_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_ryzen_endp:
|
||||
cnv2_main_loop_${ALGO}_ryzen_endp:
|
|
@ -47,7 +47,7 @@
|
|||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136
|
||||
and eax, ${MASK}
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
|
@ -57,14 +57,14 @@
|
|||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 524288
|
||||
mov r12d, ${ITERATIONS}
|
||||
|
||||
#ifdef __APPLE__
|
||||
ALIGN 16
|
||||
#else
|
||||
ALIGN 64
|
||||
#endif
|
||||
cnv2_mainloop_soft_aes_sandybridge:
|
||||
cnv2_main_loop_${ALGO}_soft_aes_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
|
@ -152,7 +152,7 @@ cnv2_mainloop_soft_aes_sandybridge:
|
|||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
|
@ -185,9 +185,9 @@ cnv2_mainloop_soft_aes_sandybridge:
|
|||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_sandybridge
|
||||
je sqrt_fixup_${ALGO}_soft_aes_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_sandybridge_ret:
|
||||
sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
|
@ -223,12 +223,12 @@ sqrt_fixup_soft_aes_sandybridge_ret:
|
|||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
and eax, ${MASK}
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_sandybridge
|
||||
jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
|
@ -249,9 +249,9 @@ sqrt_fixup_soft_aes_sandybridge_ret:
|
|||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp
|
||||
jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp
|
||||
|
||||
sqrt_fixup_soft_aes_sandybridge:
|
||||
sqrt_fixup_${ALGO}_soft_aes_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
|
@ -266,6 +266,6 @@ sqrt_fixup_soft_aes_sandybridge:
|
|||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_sandybridge_ret
|
||||
jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm_endp:
|
||||
cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp:
|
|
@ -1,70 +0,0 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 262144
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_fast_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
|
@ -1,162 +0,0 @@
|
|||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
movq xmm8, rdi
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
|
||||
movq xmm6, rcx
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 262144
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0
|
||||
punpckldq xmm3, xmm2
|
||||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 2097136
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax]
|
||||
mov BYTE PTR [rdi+rdx+11], cl
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3
|
||||
mul rcx
|
||||
movq rcx, xmm6
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax
|
||||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 2097136
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_fast_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,410 +0,0 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 262144
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN 64
|
||||
main_loop_double_fast2_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_fast2_sandybridge
|
||||
div_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_fast2_sandybridge
|
||||
div_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_fast2_sandybridge
|
||||
sqrt_fix_1_ret_fast2_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_fast2_sandybridge
|
||||
sqrt_fix_2_ret_fast2_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_fast2_sandybridge_endp
|
||||
|
||||
div_fix_1_fast2_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_fast2_sandybridge
|
||||
|
||||
div_fix_2_fast2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_1_fast2_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_fast2_sandybridge
|
||||
|
||||
sqrt_fix_2_fast2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_fast2_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_fast2_sandybridge_endp:
|
|
@ -1,180 +0,0 @@
|
|||
; Hardware-AES main loop, "fast2" variant tuned for Bulldozer.
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Runs 262144 iterations with a 0x1FFFF0 index mask, then restores every saved register.
; NOTE(review): rcx-as-argument plus xmm6-xmm8 saves look like the Win64 ABI - confirm against the including PROC.
mov QWORD PTR [rsp+16], rbx ; rbx/rbp/rsi go to stack slots above entry rsp; the epilogue reloads them
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64 ; local area: MXCSR scratch at [rsp], xmm6-xmm8 saves above it
|
||||
|
||||
stmxcsr DWORD PTR [rsp] ; remember the caller's MXCSR
|
||||
mov DWORD PTR [rsp+4], 24448 ; 24448 = 0x5F80 (rounding-control field = 10b; presumably round-up, confirm in the SDM)
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx ; r9 = copy of the state pointer (rcx is overwritten below)
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144 ; loop counter (0x40000)
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx] ; r8 = [rcx] ^ [rcx+32]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8] ; r11 = [rcx+8] ^ [rcx+40]
|
||||
mov rbx, QWORD PTR [rcx+224] ; rbx = base of the buffer indexed by every [x+rbx] access
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136 ; 2097136 = 0x1FFFF0: 16-byte-aligned offset into the buffer
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8 ; xmm8 = 0, copied into xmm1 each iteration
|
||||
mov ax, 1023 ; only bits 0-11 of rax survive the shift below
|
||||
shl rax, 52 ; rax = 0x3FF0000000000000 (bit pattern of double 1.0)
|
||||
movd xmm7, rax ; NOTE(review): xmm7 is not read again before the epilogue restores it
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0 ; xmm3 = ([48]^[16], [56]^[24])
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0 ; xmm4 = ([80]^[64], [88]^[72])
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_fast2_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx] ; load the current 16-byte line
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1 ; xmm6 = r11:r8, used as the AES round key
|
||||
lea rdx, QWORD PTR [r10+rbx] ; rdx = address of the current line (stored to below)
|
||||
lea r9, QWORD PTR [rdi+rdi] ; r9 = 2*rdi, start of the divisor
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16 ; ecx/eax/r10d = offsets of the three neighbouring 16-byte lines
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0 ; the three sums are written back rotated by one slot
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi ; rsi = r15 ^ (rdi << 32)
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52 ; rdi = 0x3FF0000000000000, added to the sqrt input below
|
||||
|
||||
movd r14, xmm5 ; r14 = low qword of the AES result
|
||||
pextrq rax, xmm5, 1 ; rax = high qword of the AES result (dividend)
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136 ; next offset comes from the AES result
|
||||
movdqa XMMWORD PTR [rdx], xmm0 ; overwrite the first line with (AES result ^ xmm3)
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx] ; r12 = address of the second line
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647 ; force bits 31 and 0 (0x80000001): divisor is odd and non-zero
|
||||
xor edx, edx
|
||||
div r9 ; rax = quotient, rdx = remainder of (0:rax) / r9
|
||||
mov eax, eax ; keep only the low 32 bits of the quotient
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx] ; r15 = (remainder << 32) + low32(quotient)
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movd xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1 ; raw bits of the square root
|
||||
test rdi, 524287 ; low 19 bits all zero -> take the exact-correction path
|
||||
je sqrt_fixup_fast2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_fast2_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14 ; rdx:rax = rsi * r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1 ; xmm0 = (high half, low half) of the product
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3 ; shift history: xmm4 <- xmm3 (xmm3 <- xmm5 below)
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8 ; write the updated pair back to the second line
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136 ; offset of the line for the next iteration
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_fast2_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp] ; restore the caller's MXCSR
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64] ; r11 = rsp as it was before "sub rsp, 64"
|
||||
mov rbx, QWORD PTR [r11+56] ; 5 pushes (40 bytes) + 16 = slot written at entry [rsp+16]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48] ; same address as [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_fast2_bulldozer_endp ; skip the out-of-line fixup below
|
||||
|
||||
sqrt_fixup_fast2_bulldozer:
|
||||
movd r9, xmm5
|
||||
add r9, r15 ; r9 = r14 + r15, the sqrt input before the shift
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32 ; rdx = 0xFFFFFC0200000000
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9 ; carry set when the product is below r9
|
||||
adc rdi, 0 ; bump the candidate root by one on carry
|
||||
jmp sqrt_fixup_fast2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_fast2_bulldozer_endp:
|
|
@ -1,182 +0,0 @@
|
|||
; Hardware-AES main loop, "fast2" variant tuned for Ivy Bridge.
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Runs 262144 iterations with a 0x1FFFF0 index mask; unlike the sibling variants the
; next line is pre-loaded into xmm6 and the square-root result is kept in xmm3.
mov QWORD PTR [rsp+24], rbx ; rbx is parked above entry rsp; reloaded from [rsp+160] in the epilogue
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp] ; remember the caller's MXCSR
|
||||
mov DWORD PTR [rsp+4], 24448 ; 24448 = 0x5F80 (rounding-control field = 10b; presumably round-up, confirm in the SDM)
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx ; r9 = copy of the state pointer
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 262144 ; loop counter (0x40000); upper half of rsi is zeroed by the 32-bit write
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647 ; r13d = 0x80000001, OR-ed into the divisor each iteration
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224] ; rbx = base of the buffer indexed by every [x+rbx] access
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136 ; 2097136 = 0x1FFFF0
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023 ; only bits 0-11 of rax survive the shift below
|
||||
shl rax, 52 ; rax = 0x3FF0000000000000 (bit pattern of double 1.0)
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx] ; pre-load the first line; later lines are loaded at the loop tail
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_fast2_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx] ; rdx = address of the current line
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16 ; ecx/eax/r10d = offsets of the three neighbouring 16-byte lines
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0 ; xmm7 = r11:r8, used as the AES round key
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6 ; rbp = low qword of the AES result
|
||||
mov r9, rbp
|
||||
and r9d, 2097136 ; offset of the second line
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0 ; the three sums are written back rotated by one slot
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3 ; rcx = previous square-root result
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax ; rdi = r15 ^ (sqrt << 32)
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0 ; overwrite the first line with (AES result ^ xmm4)
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx] ; r14 = address of the second line
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx] ; divisor = 2*sqrt ...
|
||||
add r9d, ebp ; ... + low dword of the AES result
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d ; force bits 31 and 0
|
||||
movq rax, xmm0 ; dividend = high qword of the AES result
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax ; keep only the low 32 bits of the quotient
|
||||
shl rdx, 32
|
||||
add rdx, rax ; rdx = (remainder << 32) + low32(quotient)
|
||||
lea r9, QWORD PTR [rdx+rbp] ; r9 = sqrt input before the shift (read by the fixup path)
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287 ; low 19 bits all zero -> take the exact-correction path
|
||||
je $sqrt_fixup_fast2_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_fast2_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp ; rdx:rax = rdi * rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136 ; offset of the line for the next iteration
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0 ; xmm2 = (high half, low half) of the product
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4 ; shift history: xmm5 <- xmm4 <- xmm6
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx] ; pre-load the line for the next iteration
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_fast2_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp] ; restore the caller's MXCSR
|
||||
mov rbx, QWORD PTR [rsp+160] ; 80 + 7 pushes (56 bytes) + 24 = slot written at entry [rsp+24]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_fast2_ivybridge_endp ; skip the out-of-line fixup below
|
||||
|
||||
$sqrt_fixup_fast2_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022 ; r13 is borrowed as scratch and reloaded below
|
||||
shl r13, 32 ; r13 = 0xFFFFFC0200000000
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13 ; rcx - ~r13 == rcx + r13 + 1
|
||||
mov r13d, -2147483647 ; put the 0x80000001 divisor mask back
|
||||
imul rcx, rax
|
||||
sub rcx, r9 ; carry set when the product is below r9
|
||||
adc rdx, 0 ; bump the candidate root by one on carry
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_fast2_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ivybridge_endp:
|
|
@ -1,179 +0,0 @@
|
|||
; Hardware-AES main loop, "fast2" variant tuned for Ryzen.
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Runs 262144 iterations with a 0x1FFFF0 index mask; the division result and the
; square-root input are assembled in xmm registers rather than general registers.
mov QWORD PTR [rsp+16], rbx ; rbx/rbp/rsi go to stack slots above entry rsp; the epilogue reloads them
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp] ; remember the caller's MXCSR
|
||||
mov DWORD PTR [rsp+4], 24448 ; 24448 = 0x5F80 (rounding-control field = 10b; presumably round-up, confirm in the SDM)
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx ; r9 = copy of the state pointer
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 262144 ; loop counter (0x40000)
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224] ; rbx = base of the buffer indexed by every [x+rbx] access
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136 ; 2097136 = 0x1FFFF0
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023 ; only bits 0-11 of rax survive the shift below
|
||||
shl rax, 52 ; rax = 0x3FF0000000000000 (bit pattern of double 1.0)
|
||||
movq xmm7, rax ; xmm7 is added to the sqrt input each iteration
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_fast2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx] ; load the current 16-byte line
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0 ; xmm6 = r11:r8, used as the AES round key
|
||||
lea rdx, QWORD PTR [r10+rbx] ; rdx = address of the current line
|
||||
lea r9, QWORD PTR [rdi+rdi] ; r9 = 2*rdi, start of the divisor
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16 ; ecx/eax/r10d = offsets of the three neighbouring 16-byte lines
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0 ; the three sums are written back rotated by one slot
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi ; rsi = r15 ^ (rdi << 32)
|
||||
movq r14, xmm5 ; r14 = low qword of the AES result
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136 ; offset of the second line
|
||||
movdqa XMMWORD PTR [rdx], xmm0 ; overwrite the first line with (AES result ^ xmm3)
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx] ; r12 = address of the second line
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647 ; force bits 31 and 0 (0x80000001)
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0 ; dividend = high qword of the AES result
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1 ; low qword = low32(quotient) | (low32(remainder) << 32)
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0 ; xmm2 keeps the unshifted sqrt input for the fixup path
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287 ; low 19 bits all zero -> take the exact-correction path
|
||||
je $sqrt_fixup_fast2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_fast2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14 ; rdx:rax = rsi * r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1 ; xmm0 = (high half, low half) of the product
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3 ; shift history: xmm4 <- xmm3 (xmm3 <- xmm5 below)
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136 ; offset of the line for the next iteration
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_fast2_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp] ; restore the caller's MXCSR
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64] ; r11 = rsp as it was before "sub rsp, 64"
|
||||
mov rbx, QWORD PTR [r11+56] ; 5 pushes (40 bytes) + 16 = slot written at entry [rsp+16]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48] ; same address as [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_fast2_ryzen_endp ; skip the out-of-line fixup below
|
||||
|
||||
$sqrt_fixup_fast2_ryzen:
|
||||
movq r9, xmm2 ; r9 = sqrt input before the shift
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32 ; rdx = 0xFFFFFC0200000000
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9 ; carry set when the product is below r9
|
||||
adc rdi, 0 ; bump the candidate root by one on carry
|
||||
jmp $sqrt_fixup_fast2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_fast2_ryzen_endp:
|
|
@ -1,267 +0,0 @@
|
|||
; Software-AES main loop, "fast2" variant: the AES round is done with dword table lookups
; through the pointer at [state+272] instead of the aesenc instruction.
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Runs 262144 iterations with a 0x1FFFF0 index mask. Because the table round needs most
; general registers, loop-carried values are parked in xmm10-xmm13 and in stack slots.
mov QWORD PTR [rsp+8], rcx ; keep the state pointer on the stack; re-read as [rsp+224] inside the loop
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
stmxcsr DWORD PTR [rsp+4] ; remember the caller's MXCSR
|
||||
mov DWORD PTR [rsp], 24448 ; 24448 = 0x5F80 (rounding-control field = 10b; presumably round-up, confirm in the SDM)
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx ; r10 = copy of the state pointer
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224] ; r11 = base of the buffer indexed by every [x+r11] access
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023 ; only bits 0-11 of rax survive the shift below
|
||||
shl rax, 52 ; rax = 0x3FF0000000000000 (bit pattern of double 1.0)
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136 ; 2097136 = 0x1FFFF0
|
||||
movq xmm10, QWORD PTR [r10+96] ; xmm10 carries the division result between iterations
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax ; spill the current masked offset
|
||||
movq xmm12, r11 ; xmm12 keeps the buffer base while r11 is used as scratch
|
||||
mov QWORD PTR [rsp+240], r9 ; spill r9 (high half of the round key)
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx ; xmm13 carries the square-root result between iterations
|
||||
mov r12d, 262144 ; loop counter (0x40000)
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge:
|
||||
movd xmm11, r12d ; park the loop counter; r12 becomes the table pointer
|
||||
mov r12, QWORD PTR [r10+272] ; r12 = base of the dword lookup tables
|
||||
lea r13, QWORD PTR [rax+r11] ; r13 = address of the current 16-byte line
|
||||
mov esi, DWORD PTR [r13] ; the four dwords of the line go to esi, r10d, r14d, ebp
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil ; each movzx/shr pair peels the next byte off a dword as a table index
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0 ; xmm7 = r9:r8, used as the round key
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024] ; second table sits 1024 bytes (256 dwords) after the first
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256 ; +256 dwords selects the fourth table once r12 is advanced below
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048 ; r12 now points at the third table
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movq r11, xmm12 ; r11 = buffer base again
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16 ; rcx/rax/rdx = offsets of the three neighbouring 16-byte lines
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2 ; xmm6 = the four output dwords of the table round
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7 ; add the round key
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0 ; the three sums are written back rotated by one slot
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13 ; rcx = previous square-root result
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6 ; rdi = low qword of the round result
|
||||
mov r10, rdi
|
||||
and r10d, 2097136 ; offset of the second line
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax ; rbx = division result ^ (sqrt << 32)
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
mov ecx, -2147483647 ; 0x80000001
|
||||
movdqu XMMWORD PTR [r13], xmm0 ; overwrite the first line with (round result ^ xmm4)
|
||||
or r9, rcx ; force bits 31 and 0 of the divisor
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0 ; dividend = high qword of the round result
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11] ; r14 = address of the second line
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
shl rdx, 32
|
||||
mov eax, eax ; keep only the low 32 bits of the quotient
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi] ; r9 = sqrt input before the shift (read by the fixup path)
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287 ; low 19 bits all zero -> take the exact-correction path
|
||||
je sqrt_fixup_soft_aes_fast2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1 ; carry the square-root result to the next iteration
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi ; rdx:rax = rbx * rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0 ; xmm3 = (high half, low half) of the product
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4 ; shift history: xmm5 <- xmm4 <- xmm6
|
||||
mov r9, QWORD PTR [rsp+240] ; reload the spilled r9
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
mov r10, QWORD PTR [rsp+224] ; reload the state pointer saved at entry ([rsp+8] + 64 + 152)
|
||||
movd r12d, xmm11 ; reload the loop counter
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136 ; offset of the line for the next iteration
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_fast2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4] ; restore the caller's MXCSR
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp ; skip the out-of-line fixup below
|
||||
|
||||
sqrt_fixup_soft_aes_fast2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32 ; r15 = 0xFFFFFC0200000000
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9 ; carry set when the product is below r9
|
||||
adc rdx, 0 ; bump the candidate root by one on carry
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_fast2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_fast2_sandybridge_asm_endp:
|
|
@ -1,70 +0,0 @@
|
|||
; Hardware-AES main loop of the "litev1" family: 131072 iterations, index mask 0xFFFF0.
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Each iteration does one aesenc, a table-driven patch of byte 11 of the stored line,
; and a 64x64->128 multiply whose halves are folded into r8/rdi.
; There is no ret here: execution falls through to whatever follows the include.
mov QWORD PTR [rsp+8], rbx ; rbx/rbp/rsi/rdi go to stack slots above entry rsp; the epilogue reloads them
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 131072 ; loop counter (0x20000)
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx] ; r8 = [rcx] ^ [rcx+32]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256] ; rax = pointer stored in the state block
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8] ; rdi = [rcx+8] ^ [rcx+40]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264] ; r15 = base of the byte lookup table used below
|
||||
and edx, 1048560 ; 1048560 = 0xFFFF0: 16-byte-aligned offset into the buffer
|
||||
mov r14, QWORD PTR [rax+35] ; 8 bytes read at offset 35 of that pointer ...
|
||||
xor r14, QWORD PTR [rcx+192] ; ... XOR-ed with [rcx+192]; r14 is constant for the whole loop
|
||||
mov rsi, QWORD PTR [rcx+224] ; rsi = base of the working buffer
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi] ; pre-load the first line
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0 ; xmm1 = rdi:r8, used as the AES round key
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2 ; r10 = low qword of the AES result
|
||||
mov r9d, r10d
|
||||
and r9d, 1048560
|
||||
add r9, rsi ; r9 = address of the second line
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2 ; the AES result becomes next iteration's xmm3
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0 ; overwrite the first line with (AES result ^ old xmm3)
|
||||
psrldq xmm0, 11 ; bring byte 11 of the stored value down to byte 0
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15] ; translate it through the byte table
|
||||
mov BYTE PTR [rsi+rdx+11], al ; and patch byte 11 of the line just stored
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10 ; rdx:rax = [r9] * r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax ; the high qword is stored XOR-ed with r14
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 1048560 ; offset of the line for the next iteration
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_litev1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24] ; two pushes (16 bytes) + 8 = slot written at entry [rsp+8]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
|
||||
pop r14
|
|
@ -1,162 +0,0 @@
|
|||
; Software-AES main loop of the "litev1" family: 131072 iterations, index mask 0xFFFF0.
; The AES round is done with dword table lookups through the pointer at [state+272].
; rcx points at the state block; the working buffer base is read from [rcx+224].
; Because the table round needs most general registers, loop-carried values are parked
; in xmm5-xmm9 and in the stack slot at [rsp+64].
; There is no ret here: execution falls through to whatever follows the include.
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72 ; 64 bytes for xmm6-xmm9 plus one qword spill slot at [rsp+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx] ; r8 = [rcx] ^ [rcx+32]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256] ; rax = pointer stored in the state block
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8] ; r13 = [rcx+8] ^ [rcx+40]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224] ; rdi = base of the working buffer
|
||||
and edx, 1048560 ; 1048560 = 0xFFFF0: 16-byte-aligned offset into the buffer
|
||||
mov rax, QWORD PTR [rax+35] ; 8 bytes read at offset 35 of that pointer ...
|
||||
xor rax, QWORD PTR [rcx+192] ; ... XOR-ed with [rcx+192]
|
||||
movq xmm5, rax ; xmm5 holds that constant for the whole loop
|
||||
movq xmm8, rdi ; xmm8 keeps the buffer base while rdi is used as scratch
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx ; spill the current masked offset
|
||||
|
||||
movq xmm6, rcx ; xmm6 keeps the state pointer while rcx is used as scratch
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax ; xmm7 = base of the byte lookup table used below
|
||||
|
||||
mov eax, 131072 ; loop counter (0x20000)
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax ; park the loop counter; rax is scratch inside the body
|
||||
mov r12, QWORD PTR [rcx+272] ; r12 = base of the dword lookup tables
|
||||
mov esi, DWORD PTR [rdx+rdi] ; the four dwords of the line go to esi, r10d, r14d, ebp
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil ; each movzx/shr pair peels the next byte off a dword as a table index
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024] ; second table sits 1024 bytes (256 dwords) after the first
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256 ; +256 dwords selects the fourth table once r12 is advanced below
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048 ; r12 now points at the third table
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8 ; rdi = buffer base again
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7 ; rax = byte-table base
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0 ; xmm1 = r13:r8, used as the round key
|
||||
punpckldq xmm3, xmm2 ; xmm3 = the four output dwords of the table round
|
||||
pxor xmm3, xmm1 ; add the round key
|
||||
movq r9, xmm3 ; r9 = low qword of the round result
|
||||
mov r10d, r9d
|
||||
and r10d, 1048560 ; offset of the second line
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0 ; overwrite the first line with (round result ^ xmm4)
|
||||
psrldq xmm0, 11 ; bring byte 11 of the stored value down to byte 0
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax] ; translate it through the byte table
|
||||
mov BYTE PTR [rdi+rdx+11], cl ; and patch byte 11 of the line just stored
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi] ; r9 = address of the second line
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3 ; the round result becomes next iteration's xmm4
|
||||
mul rcx ; rdx:rax = [second line] * low qword of the round result
|
||||
movq rcx, xmm6 ; rcx = state pointer again (needed at the loop top)
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax ; the high qword is stored XOR-ed with the xmm5 constant
|
||||
movq rax, xmm9 ; reload the loop counter
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 1048560 ; offset of the line for the next iteration
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_litev1_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,162 +0,0 @@
|
|||
; Software-AES main loop of the "litev1" family: 262144 iterations, index mask 0xFFFF0.
; The AES round is done with dword table lookups through the pointer at [state+272].
; rcx points at the state block; the working buffer base is read from [rcx+224].
; NOTE(review): the loop label is spelled exactly like the one in the 131072-iteration
; sibling file - confirm the two are never assembled into the same label scope.
; There is no ret here: execution falls through to whatever follows the include.
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 72 ; 64 bytes for xmm6-xmm9 plus one qword spill slot at [rsp+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp], xmm6
|
||||
movaps XMMWORD PTR [rsp+16], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
movaps XMMWORD PTR [rsp+48], xmm9
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx] ; r8 = [rcx] ^ [rcx+32]
|
||||
movq xmm4, rax
|
||||
mov rax, QWORD PTR [rcx+256] ; rax = pointer stored in the state block
|
||||
mov r13, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor r13, QWORD PTR [rcx+8] ; r13 = [rcx+8] ^ [rcx+40]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224] ; rdi = base of the working buffer
|
||||
and edx, 1048560 ; 1048560 = 0xFFFF0: 16-byte-aligned offset into the buffer
|
||||
mov rax, QWORD PTR [rax+35] ; 8 bytes read at offset 35 of that pointer ...
|
||||
xor rax, QWORD PTR [rcx+192] ; ... XOR-ed with [rcx+192]
|
||||
movq xmm5, rax ; xmm5 holds that constant for the whole loop
|
||||
movq xmm8, rdi ; xmm8 keeps the buffer base while rdi is used as scratch
|
||||
punpcklqdq xmm4, xmm0
|
||||
mov QWORD PTR [rsp+64], rdx ; spill the current masked offset
|
||||
|
||||
movq xmm6, rcx ; xmm6 keeps the state pointer while rcx is used as scratch
|
||||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax ; xmm7 = base of the byte lookup table used below
|
||||
|
||||
mov eax, 262144 ; loop counter (0x40000)
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_soft_aes_sandybridge:
|
||||
movq xmm9, rax ; park the loop counter; rax is scratch inside the body
|
||||
mov r12, QWORD PTR [rcx+272] ; r12 = base of the dword lookup tables
|
||||
mov esi, DWORD PTR [rdx+rdi] ; the four dwords of the line go to esi, r10d, r14d, ebp
|
||||
mov r10d, DWORD PTR [rdx+rdi+4]
|
||||
mov ebp, DWORD PTR [rdx+rdi+12]
|
||||
mov r14d, DWORD PTR [rdx+rdi+8]
|
||||
mov rdx, QWORD PTR [rsp+64]
|
||||
movzx ecx, sil ; each movzx/shr pair peels the next byte off a dword as a table index
|
||||
shr esi, 8
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024] ; second table sits 1024 bytes (256 dwords) after the first
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256 ; +256 dwords selects the fourth table once r12 is advanced below
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048 ; r12 now points at the third table
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
add ebp, 256
|
||||
movd xmm1, r11d
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movq rdi, xmm8 ; rdi = buffer base again
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
punpckldq xmm2, xmm1
|
||||
movq xmm1, r8
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
xor eax, r15d
|
||||
movd xmm3, eax
|
||||
movq rax, xmm7 ; rax = byte-table base
|
||||
punpckldq xmm3, xmm0
|
||||
movq xmm0, r13
|
||||
punpcklqdq xmm1, xmm0 ; xmm1 = r13:r8, used as the round key
|
||||
punpckldq xmm3, xmm2 ; xmm3 = the four output dwords of the table round
|
||||
pxor xmm3, xmm1 ; add the round key
|
||||
movq r9, xmm3 ; r9 = low qword of the round result
|
||||
mov r10d, r9d
|
||||
and r10d, 1048560 ; offset of the second line
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0 ; overwrite the first line with (round result ^ xmm4)
|
||||
psrldq xmm0, 11 ; bring byte 11 of the stored value down to byte 0
|
||||
movq rcx, xmm0
|
||||
movzx ecx, cl
|
||||
mov cl, BYTE PTR [rcx+rax] ; translate it through the byte table
|
||||
mov BYTE PTR [rdi+rdx+11], cl ; and patch byte 11 of the line just stored
|
||||
mov rbx, QWORD PTR [r10+rdi]
|
||||
mov rcx, r9
|
||||
lea r9, QWORD PTR [r10+rdi] ; r9 = address of the second line
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
movdqa xmm4, xmm3 ; the round result becomes next iteration's xmm4
|
||||
mul rcx ; rdx:rax = [second line] * low qword of the round result
|
||||
movq rcx, xmm6 ; rcx = state pointer again (needed at the loop top)
|
||||
add r8, rdx
|
||||
add r13, rax
|
||||
movq rax, xmm5
|
||||
xor rax, r13
|
||||
mov QWORD PTR [r9], r8
|
||||
xor r8, rbx
|
||||
mov QWORD PTR [r9+8], rax ; the high qword is stored XOR-ed with the xmm5 constant
|
||||
movq rax, xmm9 ; reload the loop counter
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 1048560 ; offset of the line for the next iteration
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cn_litev1_mainloop_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
movaps xmm9, XMMWORD PTR [rsp+48]
|
||||
|
||||
add rsp, 72
|
||||
pop r15
|
||||
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
|
@ -1,166 +1,171 @@
|
|||
_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE
|
||||
PUBLIC cnv1_mainloop_sandybridge_asm
|
||||
PUBLIC cn_litev1_mainloop_sandybridge_asm
|
||||
PUBLIC cn_fast_mainloop_sandybridge_asm
|
||||
PUBLIC cnv2_mainloop_ivybridge_asm
|
||||
PUBLIC cnv2_mainloop_ryzen_asm
|
||||
PUBLIC cnv2_mainloop_bulldozer_asm
|
||||
PUBLIC cnv2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cn_fastv2_mainloop_ivybridge_asm
|
||||
PUBLIC cn_fastv2_mainloop_ryzen_asm
|
||||
PUBLIC cn_fastv2_mainloop_bulldozer_asm
|
||||
PUBLIC cn_fastv2_double_mainloop_sandybridge_asm
|
||||
PUBLIC cn_liteupx_mainloop_sandybridge_asm
|
||||
PUBLIC cn_ultralitev2_mainloop_ivybridge_asm
|
||||
PUBLIC cn_ultralitev2_mainloop_ryzen_asm
|
||||
PUBLIC cn_ultralitev2_mainloop_bulldozer_asm
|
||||
PUBLIC cn_ultralitev2_double_mainloop_sandybridge_asm
|
||||
|
||||
PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_litev1_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_fast_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_fastv2_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_liteupx_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cn_ultralitev2_mainloop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_lite_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_fast_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_upx_sandybridge_asm
|
||||
|
||||
PUBLIC cnv2_main_loop_ivybridge_asm
|
||||
PUBLIC cnv2_main_loop_ryzen_asm
|
||||
PUBLIC cnv2_main_loop_bulldozer_asm
|
||||
PUBLIC cnv2_double_main_loop_sandybridge_asm
|
||||
|
||||
PUBLIC cnv2_main_loop_fastv2_ivybridge_asm
|
||||
PUBLIC cnv2_main_loop_fastv2_ryzen_asm
|
||||
PUBLIC cnv2_main_loop_fastv2_bulldozer_asm
|
||||
PUBLIC cnv2_double_main_loop_fastv2_sandybridge_asm
|
||||
|
||||
PUBLIC cnv2_main_loop_ultralite_ivybridge_asm
|
||||
PUBLIC cnv2_main_loop_ultralite_ryzen_asm
|
||||
PUBLIC cnv2_main_loop_ultralite_bulldozer_asm
|
||||
PUBLIC cnv2_double_main_loop_ultralite_sandybridge_asm
|
||||
|
||||
PUBLIC cnv1_main_loop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_lite_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_fast_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv1_main_loop_upx_soft_aes_sandybridge_asm
|
||||
|
||||
PUBLIC cnv2_main_loop_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv2_main_loop_fastv2_soft_aes_sandybridge_asm
|
||||
PUBLIC cnv2_main_loop_ultralite_soft_aes_sandybridge_asm
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cnv1_mainloop_sandybridge.inc
|
||||
cnv1_main_loop_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
cnv1_mainloop_sandybridge_asm ENDP
|
||||
cnv1_main_loop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_litev1_mainloop_sandybridge.inc
|
||||
cnv1_main_loop_lite_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_lite_sandybridge.inc
|
||||
ret 0
|
||||
cn_litev1_mainloop_sandybridge_asm ENDP
|
||||
cnv1_main_loop_lite_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_fast_mainloop_sandybridge.inc
|
||||
cnv1_main_loop_fast_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_fast_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast_mainloop_sandybridge_asm ENDP
|
||||
cnv1_main_loop_fast_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_ivybridge_asm PROC
|
||||
cnv1_main_loop_upx_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_upx_sandybridge.inc
|
||||
ret 0
|
||||
cnv1_main_loop_upx_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_main_loop_ivybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_ivybridge.inc
|
||||
ret 0
|
||||
cnv2_mainloop_ivybridge_asm ENDP
|
||||
cnv2_main_loop_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_ryzen_asm PROC
|
||||
cnv2_main_loop_ryzen_asm PROC
|
||||
INCLUDE cnv2_main_loop_ryzen.inc
|
||||
ret 0
|
||||
cnv2_mainloop_ryzen_asm ENDP
|
||||
cnv2_main_loop_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_bulldozer_asm PROC
|
||||
cnv2_main_loop_bulldozer_asm PROC
|
||||
INCLUDE cnv2_main_loop_bulldozer.inc
|
||||
ret 0
|
||||
cnv2_mainloop_bulldozer_asm ENDP
|
||||
cnv2_main_loop_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_double_mainloop_sandybridge_asm PROC
|
||||
cnv2_double_main_loop_sandybridge_asm PROC
|
||||
INCLUDE cnv2_double_main_loop_sandybridge.inc
|
||||
ret 0
|
||||
cnv2_double_mainloop_sandybridge_asm ENDP
|
||||
cnv2_double_main_loop_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fastv2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn_fastv2_main_loop_ivybridge.inc
|
||||
cnv2_main_loop_fastv2_ivybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_fastv2_ivybridge.inc
|
||||
ret 0
|
||||
cn_fastv2_mainloop_ivybridge_asm ENDP
|
||||
cnv2_main_loop_fastv2_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fastv2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn_fastv2_main_loop_ryzen.inc
|
||||
cnv2_main_loop_fastv2_ryzen_asm PROC
|
||||
INCLUDE cnv2_main_loop_fastv2_ryzen.inc
|
||||
ret 0
|
||||
cn_fastv2_mainloop_ryzen_asm ENDP
|
||||
cnv2_main_loop_fastv2_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fastv2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn_fastv2_main_loop_bulldozer.inc
|
||||
cnv2_main_loop_fastv2_bulldozer_asm PROC
|
||||
INCLUDE cnv2_main_loop_fastv2_bulldozer.inc
|
||||
ret 0
|
||||
cn_fastv2_mainloop_bulldozer_asm ENDP
|
||||
cnv2_main_loop_fastv2_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fastv2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_fastv2_double_main_loop_sandybridge.inc
|
||||
cnv2_double_main_loop_fastv2_sandybridge_asm PROC
|
||||
INCLUDE cnv2_double_main_loop_fastv2_sandybridge.inc
|
||||
ret 0
|
||||
cn_fastv2_double_mainloop_sandybridge_asm ENDP
|
||||
cnv2_double_main_loop_fastv2_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_liteupx_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_liteupx_mainloop_sandybridge.inc
|
||||
cnv2_main_loop_ultralite_ivybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_ultralite_ivybridge.inc
|
||||
ret 0
|
||||
cn_liteupx_mainloop_sandybridge_asm ENDP
|
||||
cnv2_main_loop_ultralite_ivybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_ultralitev2_mainloop_ivybridge_asm PROC
|
||||
INCLUDE cn_ultralitev2_main_loop_ivybridge.inc
|
||||
cnv2_main_loop_ultralite_ryzen_asm PROC
|
||||
INCLUDE cnv2_main_loop_ultralite_ryzen.inc
|
||||
ret 0
|
||||
cn_ultralitev2_mainloop_ivybridge_asm ENDP
|
||||
cnv2_main_loop_ultralite_ryzen_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_ultralitev2_mainloop_ryzen_asm PROC
|
||||
INCLUDE cn_ultralitev2_main_loop_ryzen.inc
|
||||
cnv2_main_loop_ultralite_bulldozer_asm PROC
|
||||
INCLUDE cnv2_main_loop_ultralite_bulldozer.inc
|
||||
ret 0
|
||||
cn_ultralitev2_mainloop_ryzen_asm ENDP
|
||||
cnv2_main_loop_ultralite_bulldozer_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_ultralitev2_mainloop_bulldozer_asm PROC
|
||||
INCLUDE cn_ultralitev2_main_loop_bulldozer.inc
|
||||
cnv2_double_main_loop_ultralite_sandybridge_asm PROC
|
||||
INCLUDE cnv2_double_main_loop_ultralite_sandybridge.inc
|
||||
ret 0
|
||||
cn_ultralitev2_mainloop_bulldozer_asm ENDP
|
||||
cnv2_double_main_loop_ultralite_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_ultralitev2_double_mainloop_sandybridge_asm PROC
|
||||
INCLUDE cn_ultralitev2_double_main_loop_sandybridge.inc
|
||||
cnv1_main_loop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_ultralitev2_double_mainloop_sandybridge_asm ENDP
|
||||
cnv1_main_loop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc
|
||||
cnv1_main_loop_lite_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_lite_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cnv1_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv1_main_loop_lite_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_litev1_mainloop_soft_aes_sandybridge.inc
|
||||
cnv1_main_loop_fast_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_fast_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_litev1_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv1_main_loop_fast_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_fast_mainloop_soft_aes_sandybridge.inc
|
||||
cnv1_main_loop_upx_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv1_main_loop_upx_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_fast_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv1_main_loop_upx_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc
|
||||
cnv2_main_loop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv2_main_loop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_fastv2_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_fastv2_mainloop_soft_aes_sandybridge.inc
|
||||
cnv2_main_loop_fastv2_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_fastv2_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_fastv2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv2_main_loop_fastv2_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_liteupx_mainloop_soft_aes_sandybridge.inc
|
||||
cnv2_main_loop_ultralite_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cnv2_main_loop_ultralite_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_liteupx_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
ALIGN 64
|
||||
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm PROC
|
||||
INCLUDE cn_ultralitev2_mainloop_soft_aes_sandybridge.inc
|
||||
ret 0
|
||||
cn_ultralitev2_mainloop_soft_aes_sandybridge_asm ENDP
|
||||
cnv2_main_loop_ultralite_soft_aes_sandybridge_asm ENDP
|
||||
|
||||
_TEXT_CN_MAINLOOP ENDS
|
||||
END
|
|
@ -3,142 +3,146 @@
|
|||
# define FN_PREFIX(fn) fn
|
||||
.section .text
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm)
|
||||
.global FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm)
|
||||
|
||||
.global FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm)
|
||||
.global FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm)
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv1_mainloop_sandybridge_asm):
|
||||
#include "../cnv1_mainloop_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_litev1_mainloop_sandybridge_asm):
|
||||
#include "../cn_litev1_mainloop_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_lite_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_lite_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast_mainloop_sandybridge_asm):
|
||||
#include "../cn_fast_mainloop_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_fast_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_fast_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_ivybridge_asm):
|
||||
FN_PREFIX(cnv1_main_loop_upx_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_upx_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_main_loop_ivybridge_asm):
|
||||
#include "../cnv2_main_loop_ivybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_ryzen_asm):
|
||||
FN_PREFIX(cnv2_main_loop_ryzen_asm):
|
||||
#include "../cnv2_main_loop_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_bulldozer_asm):
|
||||
FN_PREFIX(cnv2_main_loop_bulldozer_asm):
|
||||
#include "../cnv2_main_loop_bulldozer.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_double_mainloop_sandybridge_asm):
|
||||
FN_PREFIX(cnv2_double_main_loop_sandybridge_asm):
|
||||
#include "../cnv2_double_main_loop_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fastv2_mainloop_ivybridge_asm):
|
||||
#include "../cn_fastv2_main_loop_ivybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_ivybridge_asm):
|
||||
#include "../cnv2_main_loop_fastv2_ivybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fastv2_mainloop_ryzen_asm):
|
||||
#include "../cn_fastv2_main_loop_ryzen.inc"
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_ryzen_asm):
|
||||
#include "../cnv2_main_loop_fastv2_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fastv2_mainloop_bulldozer_asm):
|
||||
#include "../cn_fastv2_main_loop_bulldozer.inc"
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_bulldozer_asm):
|
||||
#include "../cnv2_main_loop_fastv2_bulldozer.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fastv2_double_mainloop_sandybridge_asm):
|
||||
#include "../cn_fastv2_double_main_loop_sandybridge.inc"
|
||||
FN_PREFIX(cnv2_double_main_loop_fastv2_sandybridge_asm):
|
||||
#include "../cnv2_double_main_loop_fastv2_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_liteupx_mainloop_sandybridge_asm):
|
||||
#include "../cn_liteupx_mainloop_sandybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_ivybridge_asm):
|
||||
#include "../cnv2_main_loop_ultralite_ivybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_ivybridge_asm):
|
||||
#include "../cn_ultralitev2_main_loop_ivybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_ryzen_asm):
|
||||
#include "../cnv2_main_loop_ultralite_ryzen.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_ryzen_asm):
|
||||
#include "../cn_ultralitev2_main_loop_ryzen.inc"
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_bulldozer_asm):
|
||||
#include "../cnv2_main_loop_ultralite_bulldozer.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_bulldozer_asm):
|
||||
#include "../cn_ultralitev2_main_loop_bulldozer.inc"
|
||||
FN_PREFIX(cnv2_double_main_loop_ultralite_sandybridge_asm):
|
||||
#include "../cnv2_double_main_loop_ultralite_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_ultralitev2_double_mainloop_sandybridge_asm):
|
||||
#include "../cn_ultralitev2_double_main_loop_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_lite_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_litev1_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_litev1_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_fast_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_fast_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fast_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_fast_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv1_main_loop_upx_soft_aes_sandybridge_asm):
|
||||
#include "../cnv1_main_loop_upx_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_main_loop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_fastv2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_fastv2_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_fastv2_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_main_loop_fastv2_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_liteupx_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_liteupx_mainloop_soft_aes_sandybridge.inc"
|
||||
ret 0
|
||||
|
||||
ALIGN 64
|
||||
FN_PREFIX(cn_ultralitev2_mainloop_soft_aes_sandybridge_asm):
|
||||
#include "../cn_ultralitev2_mainloop_soft_aes_sandybridge.inc"
|
||||
FN_PREFIX(cnv2_main_loop_ultralite_soft_aes_sandybridge_asm):
|
||||
#include "../cnv2_main_loop_ultralite_soft_aes_sandybridge.inc"
|
||||
ret 0
|
|
@ -1,410 +0,0 @@
|
|||
mov rax, rsp
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 184
|
||||
|
||||
stmxcsr DWORD PTR [rsp+272]
|
||||
mov DWORD PTR [rsp+276], 24448
|
||||
ldmxcsr DWORD PTR [rsp+276]
|
||||
|
||||
mov r13, QWORD PTR [rcx+224]
|
||||
mov r9, rdx
|
||||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 65536
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov rdi, QWORD PTR [r9+32]
|
||||
xor rdi, QWORD PTR [r9]
|
||||
mov rbp, QWORD PTR [r9+40]
|
||||
xor rbp, QWORD PTR [r9+8]
|
||||
movq xmm0, rdx
|
||||
movaps XMMWORD PTR [rax-88], xmm6
|
||||
movaps XMMWORD PTR [rax-104], xmm7
|
||||
movaps XMMWORD PTR [rax-120], xmm8
|
||||
movaps XMMWORD PTR [rsp+112], xmm9
|
||||
movaps XMMWORD PTR [rsp+96], xmm10
|
||||
movaps XMMWORD PTR [rsp+80], xmm11
|
||||
movaps XMMWORD PTR [rsp+64], xmm12
|
||||
movaps XMMWORD PTR [rsp+48], xmm13
|
||||
movaps XMMWORD PTR [rsp+32], xmm14
|
||||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 131056
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r8+72]
|
||||
movq xmm5, QWORD PTR [r8+104]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 1
|
||||
shl rax, 52
|
||||
movq xmm14, rax
|
||||
punpcklqdq xmm14, xmm14
|
||||
|
||||
mov eax, 1023
|
||||
shl rax, 52
|
||||
movq xmm12, rax
|
||||
punpcklqdq xmm12, xmm12
|
||||
|
||||
mov rax, QWORD PTR [r8+80]
|
||||
xor rax, QWORD PTR [r8+64]
|
||||
punpcklqdq xmm7, xmm0
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r9+56]
|
||||
xor rcx, QWORD PTR [r9+24]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [r9+48]
|
||||
xor rax, QWORD PTR [r9+16]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp], r13
|
||||
mov rcx, QWORD PTR [r9+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm6, rax
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
punpcklqdq xmm6, xmm0
|
||||
movq xmm0, rcx
|
||||
mov QWORD PTR [rsp+256], r10
|
||||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 131056
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, QWORD PTR [r9+104]
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
movdqu xmm11, XMMWORD PTR [r8]
|
||||
punpcklqdq xmm5, xmm0
|
||||
lea r9, QWORD PTR [rdx+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN 64
|
||||
main_loop_double_ultralitev2_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
xor eax, 16
|
||||
xor ebx, 32
|
||||
xor edx, 48
|
||||
|
||||
movq xmm0, r11
|
||||
movq xmm2, r10
|
||||
punpcklqdq xmm2, xmm0
|
||||
aesenc xmm9, xmm2
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+r13]
|
||||
movdqu xmm1, XMMWORD PTR [rbx+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [rbx+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r13]
|
||||
movdqu XMMWORD PTR [rdx+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [rax+r13], xmm0
|
||||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 131056
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
||||
lea rbx, QWORD PTR [rdx+r13]
|
||||
mov r10, QWORD PTR [rdx+r13]
|
||||
|
||||
movdqu xmm10, xmm11
|
||||
movq xmm0, rbp
|
||||
movq xmm11, rdi
|
||||
punpcklqdq xmm11, xmm0
|
||||
aesenc xmm10, xmm11
|
||||
|
||||
mov eax, ecx
|
||||
mov r12d, ecx
|
||||
xor eax, 16
|
||||
xor r12d, 32
|
||||
xor ecx, 48
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [rax+rsi]
|
||||
paddq xmm0, xmm6
|
||||
movdqu xmm1, XMMWORD PTR [r12+rsi]
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm0
|
||||
paddq xmm1, xmm11
|
||||
movdqu xmm0, XMMWORD PTR [rcx+rsi]
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
paddq xmm0, xmm8
|
||||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 131056
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
movdqu XMMWORD PTR [r8], xmm0
|
||||
mov r12, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov r9, QWORD PTR [rbx+8]
|
||||
|
||||
xor edx, 16
|
||||
mov r8d, edx
|
||||
mov r15d, edx
|
||||
|
||||
movq rdx, xmm5
|
||||
shl rdx, 32
|
||||
movq rax, xmm4
|
||||
xor rdx, rax
|
||||
xor r10, rdx
|
||||
mov rax, r10
|
||||
mul r11
|
||||
mov r11d, r8d
|
||||
xor r11d, 48
|
||||
movq xmm0, rdx
|
||||
xor rdx, [r11+r13]
|
||||
movq xmm1, rax
|
||||
xor rax, [r11+r13+8]
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
pxor xmm0, XMMWORD PTR [r8+r13]
|
||||
xor r8d, 32
|
||||
movdqu xmm1, XMMWORD PTR [r11+r13]
|
||||
paddq xmm0, xmm7
|
||||
paddq xmm1, xmm2
|
||||
movdqu XMMWORD PTR [r11+r13], xmm0
|
||||
movdqu xmm0, XMMWORD PTR [r8+r13]
|
||||
movdqu XMMWORD PTR [r8+r13], xmm1
|
||||
paddq xmm0, xmm3
|
||||
movdqu XMMWORD PTR [r15+r13], xmm0
|
||||
|
||||
mov r11, QWORD PTR [rsp+256]
|
||||
add r11, rdx
|
||||
mov rdx, QWORD PTR [rsp+264]
|
||||
add rdx, rax
|
||||
mov QWORD PTR [rbx], r11
|
||||
xor r11, r10
|
||||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 131056
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
movdqu xmm15, XMMWORD PTR [r11+r13]
|
||||
lea r13, QWORD PTR [rsi+rcx]
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movaps xmm2, xmm13
|
||||
movq r10, xmm0
|
||||
psllq xmm5, 1
|
||||
shl r10, 32
|
||||
movdqa xmm0, xmm9
|
||||
psrldq xmm0, 8
|
||||
movdqa xmm1, xmm10
|
||||
movq r11, xmm0
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
psrldq xmm4, 8
|
||||
movaps xmm0, xmm13
|
||||
movq rax, xmm4
|
||||
xor r10, rax
|
||||
movaps xmm1, xmm13
|
||||
xor r10, r12
|
||||
lea rax, QWORD PTR [r11+1]
|
||||
shr rax, 1
|
||||
movdqa xmm3, xmm9
|
||||
punpcklqdq xmm3, xmm10
|
||||
paddq xmm5, xmm3
|
||||
movq rdx, xmm5
|
||||
psrldq xmm5, 8
|
||||
cvtsi2sd xmm2, rax
|
||||
or edx, -2147483647
|
||||
lea rax, QWORD PTR [r8+1]
|
||||
shr rax, 1
|
||||
movq r9, xmm5
|
||||
cvtsi2sd xmm0, rax
|
||||
or r9d, -2147483647
|
||||
cvtsi2sd xmm1, rdx
|
||||
unpcklpd xmm2, xmm0
|
||||
movaps xmm0, xmm13
|
||||
cvtsi2sd xmm0, r9
|
||||
unpcklpd xmm1, xmm0
|
||||
divpd xmm2, xmm1
|
||||
paddq xmm2, xmm14
|
||||
cvttsd2si rax, xmm2
|
||||
psrldq xmm2, 8
|
||||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_ultralitev2_sandybridge
|
||||
div_fix_1_ret_ultralitev2_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
imul rax, r9
|
||||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_ultralitev2_sandybridge
|
||||
div_fix_2_ret_ultralitev2_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
punpckldq xmm2, xmm1
|
||||
punpckldq xmm4, xmm0
|
||||
punpckldq xmm4, xmm2
|
||||
paddq xmm3, xmm4
|
||||
movdqa xmm0, xmm3
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm12
|
||||
sqrtpd xmm1, xmm0
|
||||
movq r9, xmm1
|
||||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_ultralitev2_sandybridge
|
||||
sqrt_fix_1_ret_ultralitev2_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_ultralitev2_sandybridge
|
||||
sqrt_fix_2_ret_ultralitev2_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
xor r12d, 16
|
||||
xor r8d, 32
|
||||
xor ecx, 48
|
||||
mov rax, r10
|
||||
mul r9
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
punpcklqdq xmm3, xmm0
|
||||
|
||||
movdqu xmm0, XMMWORD PTR [r12+rsi]
|
||||
pxor xmm0, xmm3
|
||||
movdqu xmm1, XMMWORD PTR [r8+rsi]
|
||||
xor rdx, [r8+rsi]
|
||||
xor rax, [r8+rsi+8]
|
||||
movdqu xmm3, XMMWORD PTR [rcx+rsi]
|
||||
paddq xmm0, xmm6
|
||||
paddq xmm1, xmm11
|
||||
paddq xmm3, xmm8
|
||||
movdqu XMMWORD PTR [r8+rsi], xmm0
|
||||
movdqu XMMWORD PTR [rcx+rsi], xmm1
|
||||
movdqu XMMWORD PTR [r12+rsi], xmm3
|
||||
|
||||
add rdi, rdx
|
||||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 131056
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
add rbp, rax
|
||||
mov QWORD PTR [r13+8], rbp
|
||||
movdqu xmm11, XMMWORD PTR [rcx+rsi]
|
||||
xor rbp, rdx
|
||||
mov r13, QWORD PTR [rsp]
|
||||
movdqa xmm3, xmm7
|
||||
mov rdx, QWORD PTR [rsp+8]
|
||||
movdqa xmm8, xmm6
|
||||
mov r10, QWORD PTR [rsp+256]
|
||||
movdqa xmm7, xmm9
|
||||
mov r11, QWORD PTR [rsp+264]
|
||||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_ultralitev2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+184]
|
||||
movaps xmm6, XMMWORD PTR [r11-24]
|
||||
movaps xmm7, XMMWORD PTR [r11-40]
|
||||
movaps xmm8, XMMWORD PTR [r11-56]
|
||||
movaps xmm9, XMMWORD PTR [r11-72]
|
||||
movaps xmm10, XMMWORD PTR [r11-88]
|
||||
movaps xmm11, XMMWORD PTR [r11-104]
|
||||
movaps xmm12, XMMWORD PTR [r11-120]
|
||||
movaps xmm14, XMMWORD PTR [rsp+32]
|
||||
movaps xmm15, XMMWORD PTR [rsp+16]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp
|
||||
|
||||
div_fix_1_ultralitev2_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_ultralitev2_sandybridge
|
||||
|
||||
div_fix_2_ultralitev2_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_ultralitev2_sandybridge
|
||||
|
||||
sqrt_fix_1_ultralitev2_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
dec r9
|
||||
mov r11d, -1022
|
||||
shl r11, 32
|
||||
mov rax, r9
|
||||
shr r9, 19
|
||||
shr rax, 20
|
||||
mov rdx, r9
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+r11+1]
|
||||
add rax, r11
|
||||
imul rdx, rax
|
||||
sub rdx, r8
|
||||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_ultralitev2_sandybridge
|
||||
|
||||
sqrt_fix_2_ultralitev2_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
mov ebx, -1022
|
||||
shl rbx, 32
|
||||
mov rax, r8
|
||||
shr r8, 19
|
||||
shr rax, 20
|
||||
mov rdx, r8
|
||||
sub rdx, rax
|
||||
lea rdx, [rdx+rbx+1]
|
||||
add rax, rbx
|
||||
imul rdx, rax
|
||||
sub rdx, r11
|
||||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_ultralitev2_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_ultralitev2_sandybridge_endp:
|
|
@ -1,180 +0,0 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movd xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movd xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 131056
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movd xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movd xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_ultralitev2_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
|
||||
mov edi, 1023
|
||||
shl rdi, 52
|
||||
|
||||
movd r14, xmm5
|
||||
pextrq rax, xmm5, 1
|
||||
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 131056
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
div r9
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
lea r15, [rax+rdx]
|
||||
lea rax, [r14+r15]
|
||||
shr rax, 12
|
||||
add rax, rdi
|
||||
movd xmm0, rax
|
||||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_ultralitev2_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_ultralitev2_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
movd xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 131056
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_ultralitev2_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_ultralitev2_bulldozer_endp
|
||||
|
||||
sqrt_fixup_ultralitev2_bulldozer:
|
||||
movd r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_ultralitev2_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_ultralitev2_bulldozer_endp:
|
|
@ -1,182 +0,0 @@
|
|||
mov QWORD PTR [rsp+24], rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 80
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm4, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
movq xmm3, QWORD PTR [r9+104]
|
||||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 131056
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm5, xmm0
|
||||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_ultralitev2_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
mov rdi, r15
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
movq xmm0, r11
|
||||
movq xmm7, r8
|
||||
punpcklqdq xmm7, xmm0
|
||||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 131056
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm1, xmm7
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqu XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
mov r10, r9
|
||||
xor r10d, 32
|
||||
movq rcx, xmm3
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
xor rdi, rax
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx], xmm0
|
||||
xor rdi, QWORD PTR [r9+rbx]
|
||||
lea r14, QWORD PTR [r9+rbx]
|
||||
mov r12, QWORD PTR [r14+8]
|
||||
xor edx, edx
|
||||
lea r9d, DWORD PTR [ecx+ecx]
|
||||
add r9d, ebp
|
||||
movdqa xmm0, xmm6
|
||||
psrldq xmm0, 8
|
||||
or r9d, r13d
|
||||
movq rax, xmm0
|
||||
div r9
|
||||
xorps xmm3, xmm3
|
||||
mov eax, eax
|
||||
shl rdx, 32
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rbp]
|
||||
mov r15, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_ultralitev2_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_ultralitev2_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
mul rbp
|
||||
movq xmm2, rdx
|
||||
xor rdx, [rcx+rbx]
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 131056
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
mov QWORD PTR [r14+8], r11
|
||||
punpcklqdq xmm2, xmm0
|
||||
|
||||
mov r9d, r10d
|
||||
xor r9d, 48
|
||||
xor r10d, 16
|
||||
pxor xmm2, XMMWORD PTR [r9+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm0, xmm5
|
||||
movdqu xmm1, XMMWORD PTR [rcx+rbx]
|
||||
paddq xmm2, xmm4
|
||||
paddq xmm1, xmm7
|
||||
movdqa xmm5, xmm4
|
||||
movdqu XMMWORD PTR [r9+rbx], xmm0
|
||||
movdqa xmm4, xmm6
|
||||
movdqu XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqu XMMWORD PTR [r10+rbx], xmm1
|
||||
movdqu xmm6, [rdi+rbx]
|
||||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_ultralitev2_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
movaps xmm6, XMMWORD PTR [rsp+64]
|
||||
movaps xmm7, XMMWORD PTR [rsp+48]
|
||||
movaps xmm8, XMMWORD PTR [rsp+32]
|
||||
add rsp, 80
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_ultralitev2_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_ultralitev2_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
add rax, r13
|
||||
not r13
|
||||
sub rcx, r13
|
||||
mov r13d, -2147483647
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_ultralitev2_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_ultralitev2_ivybridge_endp:
|
|
@ -1,179 +0,0 @@
|
|||
mov QWORD PTR [rsp+16], rbx
|
||||
mov QWORD PTR [rsp+24], rbp
|
||||
mov QWORD PTR [rsp+32], rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 64
|
||||
|
||||
stmxcsr DWORD PTR [rsp]
|
||||
mov DWORD PTR [rsp+4], 24448
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 65536
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
mov r10, r8
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
movq xmm3, rax
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rbx, QWORD PTR [rcx+224]
|
||||
mov rax, QWORD PTR [r9+80]
|
||||
xor rax, QWORD PTR [r9+64]
|
||||
movq xmm0, rdx
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 131056
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+16], xmm8
|
||||
xorps xmm8, xmm8
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm7, rax
|
||||
mov r15, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movq xmm0, rcx
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_ultralitev2_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
punpcklqdq xmm6, xmm0
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
lea r9, QWORD PTR [rdi+rdi]
|
||||
shl rdi, 32
|
||||
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
xor ecx, 16
|
||||
xor eax, 32
|
||||
xor r10d, 48
|
||||
aesenc xmm5, xmm6
|
||||
movdqa xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqa xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqa xmm0, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
paddq xmm0, xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm0
|
||||
movdqa XMMWORD PTR [rax+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movaps xmm1, xmm8
|
||||
mov rsi, r15
|
||||
xor rsi, rdi
|
||||
movq r14, xmm5
|
||||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 131056
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
mov r13, QWORD PTR [r10+rbx+8]
|
||||
|
||||
add r9d, r14d
|
||||
or r9d, -2147483647
|
||||
xor edx, edx
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
|
||||
div r9
|
||||
movq xmm0, rax
|
||||
movq xmm1, rdx
|
||||
punpckldq xmm0, xmm1
|
||||
movq r15, xmm0
|
||||
paddq xmm0, xmm5
|
||||
movdqa xmm2, xmm0
|
||||
psrlq xmm0, 12
|
||||
paddq xmm0, xmm7
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_ultralitev2_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_ultralitev2_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
movq xmm0, rdx
|
||||
punpcklqdq xmm0, xmm1
|
||||
|
||||
mov r9d, r10d
|
||||
mov ecx, r10d
|
||||
xor r9d, 16
|
||||
xor ecx, 32
|
||||
xor r10d, 48
|
||||
movdqa xmm1, XMMWORD PTR [rcx+rbx]
|
||||
xor rdx, [rcx+rbx]
|
||||
xor rax, [rcx+rbx+8]
|
||||
movdqa xmm2, XMMWORD PTR [r9+rbx]
|
||||
pxor xmm2, xmm0
|
||||
paddq xmm4, XMMWORD PTR [r10+rbx]
|
||||
paddq xmm2, xmm3
|
||||
paddq xmm1, xmm6
|
||||
movdqa XMMWORD PTR [r9+rbx], xmm4
|
||||
movdqa XMMWORD PTR [rcx+rbx], xmm2
|
||||
movdqa XMMWORD PTR [r10+rbx], xmm1
|
||||
|
||||
movdqa xmm4, xmm3
|
||||
add r8, rdx
|
||||
add r11, rax
|
||||
mov QWORD PTR [r12], r8
|
||||
xor r8, rsi
|
||||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 131056
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_ultralitev2_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
lea r11, QWORD PTR [rsp+64]
|
||||
mov rbx, QWORD PTR [r11+56]
|
||||
mov rbp, QWORD PTR [r11+64]
|
||||
mov rsi, QWORD PTR [r11+72]
|
||||
movaps xmm8, XMMWORD PTR [r11-48]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
mov rsp, r11
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_ultralitev2_ryzen_endp
|
||||
|
||||
$sqrt_fixup_ultralitev2_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
shl rdx, 32
|
||||
mov rax, rdi
|
||||
shr rdi, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdi
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+rdx+1]
|
||||
add rax, rdx
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_ultralitev2_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_ultralitev2_ryzen_endp:
|
|
@ -1,267 +0,0 @@
|
|||
mov QWORD PTR [rsp+8], rcx
|
||||
push rbx
|
||||
push rbp
|
||||
push rsi
|
||||
push rdi
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
sub rsp, 152
|
||||
|
||||
stmxcsr DWORD PTR [rsp+4]
|
||||
mov DWORD PTR [rsp], 24448
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r10, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r9, QWORD PTR [rcx+40]
|
||||
xor r9, QWORD PTR [rcx+8]
|
||||
movq xmm4, rax
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r11, QWORD PTR [rcx+224]
|
||||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r10+72]
|
||||
mov rax, QWORD PTR [r10+80]
|
||||
movq xmm0, rdx
|
||||
xor rax, QWORD PTR [r10+64]
|
||||
|
||||
movaps XMMWORD PTR [rsp+16], xmm6
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
movaps XMMWORD PTR [rsp+48], xmm8
|
||||
movaps XMMWORD PTR [rsp+64], xmm9
|
||||
movaps XMMWORD PTR [rsp+80], xmm10
|
||||
movaps XMMWORD PTR [rsp+96], xmm11
|
||||
movaps XMMWORD PTR [rsp+112], xmm12
|
||||
movaps XMMWORD PTR [rsp+128], xmm13
|
||||
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
shl rax, 52
|
||||
movq xmm8, rax
|
||||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 131056
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
xorps xmm9, xmm9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
movq xmm12, r11
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 65536
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_ultralitev2_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
mov esi, DWORD PTR [r13]
|
||||
movq xmm0, r9
|
||||
mov r10d, DWORD PTR [r13+4]
|
||||
movq xmm7, r8
|
||||
mov ebp, DWORD PTR [r13+12]
|
||||
mov r14d, DWORD PTR [r13+8]
|
||||
mov rdx, QWORD PTR [rsp+248]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
punpcklqdq xmm7, xmm0
|
||||
mov r15d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
mov edi, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov ebx, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
xor r15d, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, r14b
|
||||
shr r14d, 8
|
||||
mov eax, r14d
|
||||
shr eax, 8
|
||||
xor edi, DWORD PTR [r12+rcx*4+1024]
|
||||
add eax, 256
|
||||
movzx ecx, bpl
|
||||
shr ebp, 8
|
||||
xor ebx, DWORD PTR [r12+rcx*4+1024]
|
||||
movzx ecx, sil
|
||||
shr esi, 8
|
||||
xor r9d, DWORD PTR [r12+rcx*4+1024]
|
||||
add r12, 2048
|
||||
movzx ecx, r10b
|
||||
shr r10d, 8
|
||||
add r10d, 256
|
||||
mov r11d, DWORD PTR [r12+rax*4]
|
||||
xor r11d, DWORD PTR [r12+rcx*4]
|
||||
xor r11d, r9d
|
||||
movzx ecx, sil
|
||||
mov r10d, DWORD PTR [r12+r10*4]
|
||||
shr esi, 8
|
||||
add esi, 256
|
||||
xor r10d, DWORD PTR [r12+rcx*4]
|
||||
movzx ecx, bpl
|
||||
xor r10d, ebx
|
||||
shr ebp, 8
|
||||
movd xmm1, r11d
|
||||
add ebp, 256
|
||||
movq r11, xmm12
|
||||
mov r9d, DWORD PTR [r12+rcx*4]
|
||||
xor r9d, DWORD PTR [r12+rsi*4]
|
||||
mov eax, DWORD PTR [r12+rbp*4]
|
||||
xor r9d, edi
|
||||
movzx ecx, r14b
|
||||
movd xmm0, r10d
|
||||
movd xmm2, r9d
|
||||
xor eax, DWORD PTR [r12+rcx*4]
|
||||
mov rcx, rdx
|
||||
xor eax, r15d
|
||||
punpckldq xmm2, xmm1
|
||||
xor rcx, 16
|
||||
movd xmm6, eax
|
||||
mov rax, rdx
|
||||
punpckldq xmm6, xmm0
|
||||
xor rax, 32
|
||||
punpckldq xmm6, xmm2
|
||||
xor rdx, 48
|
||||
movdqu xmm2, XMMWORD PTR [rcx+r11]
|
||||
pxor xmm6, xmm7
|
||||
paddq xmm2, xmm4
|
||||
movdqu xmm1, XMMWORD PTR [rax+r11]
|
||||
movdqu xmm0, XMMWORD PTR [rdx+r11]
|
||||
paddq xmm0, xmm5
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm0
|
||||
movdqu XMMWORD PTR [rax+r11], xmm2
|
||||
movq rcx, xmm13
|
||||
paddq xmm1, xmm7
|
||||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 131056
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
movq rbx, xmm10
|
||||
xor rbx, rax
|
||||
lea r9, QWORD PTR [rcx+rcx]
|
||||
add r9d, edi
|
||||
movdqa xmm0, xmm6
|
||||
pxor xmm0, xmm4
|
||||
mov ecx, -2147483647
|
||||
movdqu XMMWORD PTR [r13], xmm0
|
||||
or r9, rcx
|
||||
movdqa xmm0, xmm6
|
||||
movaps xmm1, xmm9
|
||||
psrldq xmm0, 8
|
||||
movq rax, xmm0
|
||||
xor rbx, QWORD PTR [r10+r11]
|
||||
lea r14, QWORD PTR [r10+r11]
|
||||
mov rbp, QWORD PTR [r14+8]
|
||||
div r9
|
||||
shl rdx, 32
|
||||
mov eax, eax
|
||||
add rdx, rax
|
||||
lea r9, QWORD PTR [rdx+rdi]
|
||||
movq xmm10, rdx
|
||||
mov rax, r9
|
||||
shr rax, 12
|
||||
movq xmm0, rax
|
||||
paddq xmm0, xmm8
|
||||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_ultralitev2_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
xor r9, 16
|
||||
mov rcx, r10
|
||||
xor rcx, 32
|
||||
xor r10, 48
|
||||
mov rax, rbx
|
||||
mul rdi
|
||||
movdqu xmm2, XMMWORD PTR [r9+r11]
|
||||
movdqu xmm1, XMMWORD PTR [rcx+r11]
|
||||
paddq xmm1, xmm7
|
||||
movq xmm0, rax
|
||||
movq xmm3, rdx
|
||||
xor rax, QWORD PTR [r11+rcx+8]
|
||||
xor rdx, QWORD PTR [rcx+r11]
|
||||
punpcklqdq xmm3, xmm0
|
||||
add r8, rdx
|
||||
movdqu xmm0, XMMWORD PTR [r10+r11]
|
||||
pxor xmm2, xmm3
|
||||
paddq xmm0, xmm5
|
||||
paddq xmm2, xmm4
|
||||
movdqu XMMWORD PTR [r9+r11], xmm0
|
||||
movdqa xmm5, xmm4
|
||||
mov r9, QWORD PTR [rsp+240]
|
||||
movdqa xmm4, xmm6
|
||||
add r9, rax
|
||||
movdqu XMMWORD PTR [rcx+r11], xmm2
|
||||
movdqu XMMWORD PTR [r10+r11], xmm1
|
||||
mov r10, QWORD PTR [rsp+224]
|
||||
movd r12d, xmm11
|
||||
mov QWORD PTR [r14], r8
|
||||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 131056
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_ultralitev2_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
movaps xmm7, XMMWORD PTR [rsp+32]
|
||||
movaps xmm8, XMMWORD PTR [rsp+48]
|
||||
movaps xmm9, XMMWORD PTR [rsp+64]
|
||||
movaps xmm10, XMMWORD PTR [rsp+80]
|
||||
movaps xmm11, XMMWORD PTR [rsp+96]
|
||||
movaps xmm12, XMMWORD PTR [rsp+112]
|
||||
movaps xmm13, XMMWORD PTR [rsp+128]
|
||||
|
||||
add rsp, 152
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp
|
||||
|
||||
sqrt_fixup_soft_aes_ultralitev2_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
mov rax, rdx
|
||||
shr rdx, 19
|
||||
shr rax, 20
|
||||
mov rcx, rdx
|
||||
sub rcx, rax
|
||||
lea rcx, [rcx+r15+1]
|
||||
add rax, r15
|
||||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_ultralitev2_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_ultralitev2_sandybridge_asm_endp:
|
|
@ -5,7 +5,7 @@
|
|||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 262144
|
||||
mov ebp, ${ITERATIONS}
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
|
@ -18,7 +18,7 @@
|
|||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 1048560
|
||||
and edx, ${MASK}
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
|
@ -26,14 +26,14 @@
|
|||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
ALIGN 64
|
||||
cn_litev1_mainloop_sandybridge:
|
||||
cnv1_main_loop_${ALGO}_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 1048560
|
||||
and r9d, ${MASK}
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
|
@ -56,11 +56,11 @@ cn_litev1_mainloop_sandybridge:
|
|||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 1048560
|
||||
and edx, ${MASK}
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cn_litev1_mainloop_sandybridge
|
||||
jne cnv1_main_loop_${ALGO}_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
|
@ -26,7 +26,7 @@
|
|||
xor r13, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov rdi, QWORD PTR [rcx+224]
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov rax, QWORD PTR [rax+35]
|
||||
xor rax, QWORD PTR [rcx+192]
|
||||
movq xmm5, rax
|
||||
|
@ -38,10 +38,10 @@
|
|||
mov rax, QWORD PTR [rcx+264]
|
||||
movq xmm7, rax
|
||||
|
||||
mov eax, 524288
|
||||
mov eax, ${ITERATIONS}
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_soft_aes_sandybridge:
|
||||
cnv1_main_loop_${ALGO}_soft_aes_sandybridge:
|
||||
movq xmm9, rax
|
||||
mov r12, QWORD PTR [rcx+272]
|
||||
mov esi, DWORD PTR [rdx+rdi]
|
||||
|
@ -114,7 +114,7 @@ cnv1_mainloop_soft_aes_sandybridge:
|
|||
pxor xmm3, xmm1
|
||||
movq r9, xmm3
|
||||
mov r10d, r9d
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm0, xmm3
|
||||
pxor xmm0, xmm4
|
||||
movdqu XMMWORD PTR [rdx+rdi], xmm0
|
||||
|
@ -141,10 +141,10 @@ cnv1_mainloop_soft_aes_sandybridge:
|
|||
movq rax, xmm9
|
||||
mov rdx, r8
|
||||
xor r13, r11
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov QWORD PTR [rsp+64], rdx
|
||||
sub eax, 1
|
||||
jne cnv1_mainloop_soft_aes_sandybridge
|
||||
jne cnv1_main_loop_${ALGO}_soft_aes_sandybridge
|
||||
|
||||
movaps xmm6, XMMWORD PTR [rsp]
|
||||
movaps xmm7, XMMWORD PTR [rsp+16]
|
|
@ -1,70 +0,0 @@
|
|||
mov QWORD PTR [rsp+8], rbx
|
||||
mov QWORD PTR [rsp+16], rbp
|
||||
mov QWORD PTR [rsp+24], rsi
|
||||
mov QWORD PTR [rsp+32], rdi
|
||||
push r14
|
||||
push r15
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
mov ebp, 524288
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov rdx, QWORD PTR [rcx+56]
|
||||
xor rdx, QWORD PTR [rcx+24]
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
movq xmm3, rax
|
||||
mov rax, QWORD PTR [rcx+256]
|
||||
mov rdi, QWORD PTR [rcx+40]
|
||||
movq xmm0, rdx
|
||||
xor rdi, QWORD PTR [rcx+8]
|
||||
mov rdx, r8
|
||||
mov r15, QWORD PTR [rcx+264]
|
||||
and edx, 2097136
|
||||
mov r14, QWORD PTR [rax+35]
|
||||
xor r14, QWORD PTR [rcx+192]
|
||||
mov rsi, QWORD PTR [rcx+224]
|
||||
punpcklqdq xmm3, xmm0
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
|
||||
ALIGN 64
|
||||
cnv1_mainloop_sandybridge:
|
||||
movq xmm0, rdi
|
||||
movq xmm1, r8
|
||||
punpcklqdq xmm1, xmm0
|
||||
aesenc xmm2, xmm1
|
||||
movq r10, xmm2
|
||||
mov r9d, r10d
|
||||
and r9d, 2097136
|
||||
add r9, rsi
|
||||
movdqa xmm0, xmm2
|
||||
pxor xmm0, xmm3
|
||||
movdqa xmm3, xmm2
|
||||
movdqu XMMWORD PTR [rdx+rsi], xmm0
|
||||
psrldq xmm0, 11
|
||||
movq rax, xmm0
|
||||
movzx eax, al
|
||||
movzx eax, BYTE PTR [rax+r15]
|
||||
mov BYTE PTR [rsi+rdx+11], al
|
||||
mov rbx, QWORD PTR [r9]
|
||||
mov r11, QWORD PTR [r9+8]
|
||||
mov rax, rbx
|
||||
mul r10
|
||||
add r8, rdx
|
||||
mov QWORD PTR [r9], r8
|
||||
add rdi, rax
|
||||
mov rax, r14
|
||||
xor rax, rdi
|
||||
mov QWORD PTR [r9+8], rax
|
||||
xor r8, rbx
|
||||
mov rdx, r8
|
||||
and edx, 2097136
|
||||
movdqu xmm2, XMMWORD PTR [rdx+rsi]
|
||||
xor rdi, r11
|
||||
dec ebp
|
||||
jne cnv1_mainloop_sandybridge
|
||||
|
||||
mov rbx, QWORD PTR [rsp+24]
|
||||
mov rbp, QWORD PTR [rsp+32]
|
||||
mov rsi, QWORD PTR [rsp+40]
|
||||
mov rdi, QWORD PTR [rsp+48]
|
||||
pop r15
|
||||
pop r14
|
|
@ -18,7 +18,7 @@
|
|||
mov r10, QWORD PTR [rcx+32]
|
||||
mov r8, rcx
|
||||
xor r10, QWORD PTR [rcx]
|
||||
mov r14d, 524288
|
||||
mov r14d, ${ITERATIONS}
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
xor r11, QWORD PTR [rcx+8]
|
||||
mov rsi, QWORD PTR [rdx+224]
|
||||
|
@ -41,7 +41,7 @@
|
|||
movaps XMMWORD PTR [rsp+16], xmm15
|
||||
mov rdx, r10
|
||||
movq xmm4, QWORD PTR [r8+96]
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
mov rax, QWORD PTR [rcx+48]
|
||||
xorps xmm13, xmm13
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
|
@ -83,7 +83,7 @@
|
|||
mov rcx, rdi
|
||||
mov QWORD PTR [rsp+264], r11
|
||||
movq xmm8, rax
|
||||
and ecx, 2097136
|
||||
and ecx, ${MASK}
|
||||
punpcklqdq xmm8, xmm0
|
||||
movq xmm0, QWORD PTR [r9+96]
|
||||
punpcklqdq xmm4, xmm0
|
||||
|
@ -95,7 +95,7 @@
|
|||
movdqu xmm15, XMMWORD PTR [r9]
|
||||
|
||||
ALIGN 64
|
||||
main_loop_double_sandybridge:
|
||||
cnv2_double_main_loop_${ALGO}_sandybridge:
|
||||
movdqu xmm9, xmm15
|
||||
mov eax, edx
|
||||
mov ebx, edx
|
||||
|
@ -120,7 +120,7 @@ main_loop_double_sandybridge:
|
|||
|
||||
movq r11, xmm9
|
||||
mov edx, r11d
|
||||
and edx, 2097136
|
||||
and edx, ${MASK}
|
||||
movdqa xmm0, xmm9
|
||||
pxor xmm0, xmm7
|
||||
movdqu XMMWORD PTR [r9], xmm0
|
||||
|
@ -151,7 +151,7 @@ main_loop_double_sandybridge:
|
|||
movdqu XMMWORD PTR [rax+rsi], xmm0
|
||||
|
||||
movq rcx, xmm10
|
||||
and ecx, 2097136
|
||||
and ecx, ${MASK}
|
||||
|
||||
movdqa xmm0, xmm10
|
||||
pxor xmm0, xmm6
|
||||
|
@ -199,7 +199,7 @@ main_loop_double_sandybridge:
|
|||
mov QWORD PTR [rbx+8], rdx
|
||||
xor rdx, r9
|
||||
mov QWORD PTR [rsp+256], r11
|
||||
and r11d, 2097136
|
||||
and r11d, ${MASK}
|
||||
mov QWORD PTR [rsp+264], rdx
|
||||
mov QWORD PTR [rsp+8], r11
|
||||
lea r15, QWORD PTR [r11+r13]
|
||||
|
@ -249,8 +249,8 @@ main_loop_double_sandybridge:
|
|||
mov rbx, rax
|
||||
imul rax, rdx
|
||||
sub r11, rax
|
||||
js div_fix_1_sandybridge
|
||||
div_fix_1_ret_sandybridge:
|
||||
js div_fix_1_${ALGO}_sandybridge
|
||||
div_fix_1_ret_${ALGO}_sandybridge:
|
||||
|
||||
cvttsd2si rdx, xmm2
|
||||
mov rax, rdx
|
||||
|
@ -258,8 +258,8 @@ div_fix_1_ret_sandybridge:
|
|||
movd xmm2, r11d
|
||||
movd xmm4, ebx
|
||||
sub r8, rax
|
||||
js div_fix_2_sandybridge
|
||||
div_fix_2_ret_sandybridge:
|
||||
js div_fix_2_${ALGO}_sandybridge
|
||||
div_fix_2_ret_${ALGO}_sandybridge:
|
||||
|
||||
movd xmm1, r8d
|
||||
movd xmm0, edx
|
||||
|
@ -275,15 +275,15 @@ div_fix_2_ret_sandybridge:
|
|||
movdqa xmm5, xmm1
|
||||
psrlq xmm5, 19
|
||||
test r9, 524287
|
||||
je sqrt_fix_1_sandybridge
|
||||
sqrt_fix_1_ret_sandybridge:
|
||||
je sqrt_fix_1_${ALGO}_sandybridge
|
||||
sqrt_fix_1_ret_${ALGO}_sandybridge:
|
||||
|
||||
movq r9, xmm10
|
||||
psrldq xmm1, 8
|
||||
movq r8, xmm1
|
||||
test r8, 524287
|
||||
je sqrt_fix_2_sandybridge
|
||||
sqrt_fix_2_ret_sandybridge:
|
||||
je sqrt_fix_2_${ALGO}_sandybridge
|
||||
sqrt_fix_2_ret_${ALGO}_sandybridge:
|
||||
|
||||
mov r12d, ecx
|
||||
mov r8d, ecx
|
||||
|
@ -313,7 +313,7 @@ sqrt_fix_2_ret_sandybridge:
|
|||
mov QWORD PTR [r13], rdi
|
||||
xor rdi, r10
|
||||
mov ecx, edi
|
||||
and ecx, 2097136
|
||||
and ecx, ${MASK}
|
||||
lea r8, QWORD PTR [rcx+rsi]
|
||||
|
||||
mov rdx, QWORD PTR [r13+8]
|
||||
|
@ -331,7 +331,7 @@ sqrt_fix_2_ret_sandybridge:
|
|||
movdqa xmm6, xmm10
|
||||
mov r9, r15
|
||||
dec r14d
|
||||
jne main_loop_double_sandybridge
|
||||
jne cnv2_double_main_loop_${ALGO}_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+272]
|
||||
movaps xmm13, XMMWORD PTR [rsp+48]
|
||||
|
@ -354,19 +354,19 @@ sqrt_fix_2_ret_sandybridge:
|
|||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_double_mainloop_asm_sandybridge_endp
|
||||
jmp cnv2_double_main_loop_${ALGO}_sandybridge_endp
|
||||
|
||||
div_fix_1_sandybridge:
|
||||
div_fix_1_${ALGO}_sandybridge:
|
||||
dec rbx
|
||||
add r11, rdx
|
||||
jmp div_fix_1_ret_sandybridge
|
||||
jmp div_fix_1_ret_${ALGO}_sandybridge
|
||||
|
||||
div_fix_2_sandybridge:
|
||||
div_fix_2_${ALGO}_sandybridge:
|
||||
dec rdx
|
||||
add r8, r9
|
||||
jmp div_fix_2_ret_sandybridge
|
||||
jmp div_fix_2_ret_${ALGO}_sandybridge
|
||||
|
||||
sqrt_fix_1_sandybridge:
|
||||
sqrt_fix_1_${ALGO}_sandybridge:
|
||||
movq r8, xmm3
|
||||
movdqa xmm0, xmm5
|
||||
psrldq xmm0, 8
|
||||
|
@ -385,9 +385,9 @@ sqrt_fix_1_sandybridge:
|
|||
adc r9, 0
|
||||
movq xmm5, r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_1_ret_sandybridge
|
||||
jmp sqrt_fix_1_ret_${ALGO}_sandybridge
|
||||
|
||||
sqrt_fix_2_sandybridge:
|
||||
sqrt_fix_2_${ALGO}_sandybridge:
|
||||
psrldq xmm3, 8
|
||||
movq r11, xmm3
|
||||
dec r8
|
||||
|
@ -405,6 +405,6 @@ sqrt_fix_2_sandybridge:
|
|||
adc r8, 0
|
||||
movq xmm0, r8
|
||||
punpcklqdq xmm5, xmm0
|
||||
jmp sqrt_fix_2_ret_sandybridge
|
||||
jmp sqrt_fix_2_ret_${ALGO}_sandybridge
|
||||
|
||||
cnv2_double_mainloop_asm_sandybridge_endp:
|
||||
cnv2_double_main_loop_${ALGO}_sandybridge_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov ebp, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
|
@ -31,7 +31,7 @@
|
|||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movd xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
|
@ -46,7 +46,7 @@
|
|||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 16
|
||||
cnv2_main_loop_bulldozer:
|
||||
cnv2_main_loop_${ALGO}_bulldozer:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movd xmm6, r8
|
||||
pinsrq xmm6, r11, 1
|
||||
|
@ -83,7 +83,7 @@ cnv2_main_loop_bulldozer:
|
|||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
|
@ -103,10 +103,10 @@ cnv2_main_loop_bulldozer:
|
|||
sqrtsd xmm1, xmm0
|
||||
movd rdi, xmm1
|
||||
test rdi, 524287
|
||||
je sqrt_fixup_bulldozer
|
||||
je sqrt_fixup_${ALGO}_bulldozer
|
||||
shr rdi, 19
|
||||
|
||||
sqrt_fixup_bulldozer_ret:
|
||||
sqrt_fixup_${ALGO}_bulldozer_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movd xmm1, rax
|
||||
|
@ -138,10 +138,10 @@ sqrt_fixup_bulldozer_ret:
|
|||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne cnv2_main_loop_bulldozer
|
||||
jne cnv2_main_loop_${ALGO}_bulldozer
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
|
@ -157,9 +157,9 @@ sqrt_fixup_bulldozer_ret:
|
|||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp cnv2_main_loop_bulldozer_endp
|
||||
jmp cnv2_main_loop_${ALGO}_bulldozer_endp
|
||||
|
||||
sqrt_fixup_bulldozer:
|
||||
sqrt_fixup_${ALGO}_bulldozer:
|
||||
movd r9, xmm5
|
||||
add r9, r15
|
||||
dec rdi
|
||||
|
@ -175,6 +175,6 @@ sqrt_fixup_bulldozer:
|
|||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp sqrt_fixup_bulldozer_ret
|
||||
jmp sqrt_fixup_${ALGO}_bulldozer_ret
|
||||
|
||||
cnv2_main_loop_bulldozer_endp:
|
||||
cnv2_main_loop_${ALGO}_bulldozer_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov esi, 524288
|
||||
mov esi, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
mov r13d, -2147483647
|
||||
xor r8, QWORD PTR [rcx]
|
||||
|
@ -35,7 +35,7 @@
|
|||
movaps XMMWORD PTR [rsp+64], xmm6
|
||||
movaps XMMWORD PTR [rsp+48], xmm7
|
||||
movaps XMMWORD PTR [rsp+32], xmm8
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movq xmm5, rax
|
||||
|
||||
mov ax, 1023
|
||||
|
@ -48,7 +48,7 @@
|
|||
movdqu xmm6, XMMWORD PTR [r10+rbx]
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_ivybridge:
|
||||
cnv2_main_loop_${ALGO}_ivybridge:
|
||||
lea rdx, QWORD PTR [r10+rbx]
|
||||
mov ecx, r10d
|
||||
mov eax, r10d
|
||||
|
@ -62,7 +62,7 @@ $main_loop_ivybridge:
|
|||
aesenc xmm6, xmm7
|
||||
movq rbp, xmm6
|
||||
mov r9, rbp
|
||||
and r9d, 2097136
|
||||
and r9d, ${MASK}
|
||||
movdqu xmm2, XMMWORD PTR [rcx+rbx]
|
||||
movdqu xmm1, XMMWORD PTR [rax+rbx]
|
||||
movdqu xmm0, XMMWORD PTR [r10+rbx]
|
||||
|
@ -105,9 +105,9 @@ $main_loop_ivybridge:
|
|||
sqrtsd xmm3, xmm0
|
||||
movq rdx, xmm3
|
||||
test edx, 524287
|
||||
je $sqrt_fixup_ivybridge
|
||||
je sqrt_fixup_${ALGO}_ivybridge
|
||||
psrlq xmm3, 19
|
||||
$sqrt_fixup_ivybridge_ret:
|
||||
sqrt_fixup_${ALGO}_ivybridge_ret:
|
||||
|
||||
mov ecx, r10d
|
||||
mov rax, rdi
|
||||
|
@ -118,7 +118,7 @@ $sqrt_fixup_ivybridge_ret:
|
|||
mov QWORD PTR [r14], r8
|
||||
xor r8, rdi
|
||||
mov edi, r8d
|
||||
and edi, 2097136
|
||||
and edi, ${MASK}
|
||||
movq xmm0, rax
|
||||
xor rax, [rcx+rbx+8]
|
||||
add r11, rax
|
||||
|
@ -143,7 +143,7 @@ $sqrt_fixup_ivybridge_ret:
|
|||
mov r10d, edi
|
||||
xor r11, r12
|
||||
dec rsi
|
||||
jne $main_loop_ivybridge
|
||||
jne cnv2_main_loop_${ALGO}_ivybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
mov rbx, QWORD PTR [rsp+160]
|
||||
|
@ -158,9 +158,9 @@ $sqrt_fixup_ivybridge_ret:
|
|||
pop rdi
|
||||
pop rsi
|
||||
pop rbp
|
||||
jmp $cnv2_main_loop_ivybridge_endp
|
||||
jmp cnv2_main_loop_${ALGO}_ivybridge_endp
|
||||
|
||||
$sqrt_fixup_ivybridge:
|
||||
sqrt_fixup_${ALGO}_ivybridge:
|
||||
dec rdx
|
||||
mov r13d, -1022
|
||||
shl r13, 32
|
||||
|
@ -177,6 +177,6 @@ $sqrt_fixup_ivybridge:
|
|||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm3, rdx
|
||||
jmp $sqrt_fixup_ivybridge_ret
|
||||
jmp sqrt_fixup_${ALGO}_ivybridge_ret
|
||||
|
||||
$cnv2_main_loop_ivybridge_endp:
|
||||
cnv2_main_loop_${ALGO}_ivybridge_endp:
|
|
@ -15,7 +15,7 @@
|
|||
mov rax, QWORD PTR [rcx+48]
|
||||
mov r9, rcx
|
||||
xor rax, QWORD PTR [rcx+16]
|
||||
mov ebp, 524288
|
||||
mov ebp, ${ITERATIONS}
|
||||
mov r8, QWORD PTR [rcx+32]
|
||||
xor r8, QWORD PTR [rcx]
|
||||
mov r11, QWORD PTR [rcx+40]
|
||||
|
@ -31,7 +31,7 @@
|
|||
mov rcx, QWORD PTR [rcx+88]
|
||||
xor rcx, QWORD PTR [r9+72]
|
||||
mov rdi, QWORD PTR [r9+104]
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movaps XMMWORD PTR [rsp+48], xmm6
|
||||
movq xmm4, rax
|
||||
movaps XMMWORD PTR [rsp+32], xmm7
|
||||
|
@ -46,7 +46,7 @@
|
|||
punpcklqdq xmm4, xmm0
|
||||
|
||||
ALIGN 64
|
||||
$main_loop_ryzen:
|
||||
cnv2_main_loop_${ALGO}_ryzen:
|
||||
movdqa xmm5, XMMWORD PTR [r10+rbx]
|
||||
movq xmm0, r11
|
||||
movq xmm6, r8
|
||||
|
@ -78,7 +78,7 @@ $main_loop_ryzen:
|
|||
movdqa xmm0, xmm5
|
||||
pxor xmm0, xmm3
|
||||
mov r10, r14
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa XMMWORD PTR [rdx], xmm0
|
||||
xor rsi, QWORD PTR [r10+rbx]
|
||||
lea r12, QWORD PTR [r10+rbx]
|
||||
|
@ -103,10 +103,10 @@ $main_loop_ryzen:
|
|||
sqrtsd xmm1, xmm0
|
||||
movq rdi, xmm1
|
||||
test rdi, 524287
|
||||
je $sqrt_fixup_ryzen
|
||||
je sqrt_fixup_${ALGO}_ryzen
|
||||
shr rdi, 19
|
||||
|
||||
$sqrt_fixup_ryzen_ret:
|
||||
sqrt_fixup_${ALGO}_ryzen_ret:
|
||||
mov rax, rsi
|
||||
mul r14
|
||||
movq xmm1, rax
|
||||
|
@ -138,10 +138,10 @@ $sqrt_fixup_ryzen_ret:
|
|||
mov QWORD PTR [r12+8], r11
|
||||
mov r10, r8
|
||||
xor r11, r13
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
movdqa xmm3, xmm5
|
||||
dec ebp
|
||||
jne $main_loop_ryzen
|
||||
jne cnv2_main_loop_${ALGO}_ryzen
|
||||
|
||||
ldmxcsr DWORD PTR [rsp]
|
||||
movaps xmm6, XMMWORD PTR [rsp+48]
|
||||
|
@ -157,9 +157,9 @@ $sqrt_fixup_ryzen_ret:
|
|||
pop r13
|
||||
pop r12
|
||||
pop rdi
|
||||
jmp $cnv2_main_loop_ryzen_endp
|
||||
jmp cnv2_main_loop_${ALGO}_ryzen_endp
|
||||
|
||||
$sqrt_fixup_ryzen:
|
||||
sqrt_fixup_${ALGO}_ryzen:
|
||||
movq r9, xmm2
|
||||
dec rdi
|
||||
mov edx, -1022
|
||||
|
@ -174,6 +174,6 @@ $sqrt_fixup_ryzen:
|
|||
imul rcx, rax
|
||||
sub rcx, r9
|
||||
adc rdi, 0
|
||||
jmp $sqrt_fixup_ryzen_ret
|
||||
jmp sqrt_fixup_${ALGO}_ryzen_ret
|
||||
|
||||
$cnv2_main_loop_ryzen_endp:
|
||||
cnv2_main_loop_${ALGO}_ryzen_endp:
|
|
@ -47,7 +47,7 @@
|
|||
|
||||
mov rax, r8
|
||||
punpcklqdq xmm4, xmm0
|
||||
and eax, 2097136
|
||||
and eax, ${MASK}
|
||||
movq xmm10, QWORD PTR [r10+96]
|
||||
movq xmm0, rcx
|
||||
mov rcx, QWORD PTR [r10+104]
|
||||
|
@ -57,10 +57,10 @@
|
|||
mov QWORD PTR [rsp+240], r9
|
||||
punpcklqdq xmm5, xmm0
|
||||
movq xmm13, rcx
|
||||
mov r12d, 524288
|
||||
mov r12d, ${ITERATIONS}
|
||||
|
||||
ALIGN 64
|
||||
cnv2_mainloop_soft_aes_sandybridge:
|
||||
cnv2_main_loop_${ALGO}_soft_aes_sandybridge:
|
||||
movd xmm11, r12d
|
||||
mov r12, QWORD PTR [r10+272]
|
||||
lea r13, QWORD PTR [rax+r11]
|
||||
|
@ -148,7 +148,7 @@ cnv2_mainloop_soft_aes_sandybridge:
|
|||
movdqu XMMWORD PTR [rdx+r11], xmm1
|
||||
movq rdi, xmm6
|
||||
mov r10, rdi
|
||||
and r10d, 2097136
|
||||
and r10d, ${MASK}
|
||||
xor edx, edx
|
||||
mov rax, rcx
|
||||
shl rax, 32
|
||||
|
@ -181,9 +181,9 @@ cnv2_mainloop_soft_aes_sandybridge:
|
|||
sqrtsd xmm1, xmm0
|
||||
movq rdx, xmm1
|
||||
test rdx, 524287
|
||||
je sqrt_fixup_soft_aes_sandybridge
|
||||
je sqrt_fixup_${ALGO}_soft_aes_sandybridge
|
||||
psrlq xmm1, 19
|
||||
sqrt_fixup_soft_aes_sandybridge_ret:
|
||||
sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret:
|
||||
|
||||
mov r9, r10
|
||||
movdqa xmm13, xmm1
|
||||
|
@ -219,12 +219,12 @@ sqrt_fixup_soft_aes_sandybridge_ret:
|
|||
xor r8, rbx
|
||||
mov rax, r8
|
||||
mov QWORD PTR [r14+8], r9
|
||||
and eax, 2097136
|
||||
and eax, ${MASK}
|
||||
xor r9, rbp
|
||||
mov QWORD PTR [rsp+240], r9
|
||||
mov QWORD PTR [rsp+248], rax
|
||||
sub r12d, 1
|
||||
jne cnv2_mainloop_soft_aes_sandybridge
|
||||
jne cnv2_main_loop_${ALGO}_soft_aes_sandybridge
|
||||
|
||||
ldmxcsr DWORD PTR [rsp+4]
|
||||
movaps xmm6, XMMWORD PTR [rsp+16]
|
||||
|
@ -245,9 +245,9 @@ sqrt_fixup_soft_aes_sandybridge_ret:
|
|||
pop rsi
|
||||
pop rbp
|
||||
pop rbx
|
||||
jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp
|
||||
jmp cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp
|
||||
|
||||
sqrt_fixup_soft_aes_sandybridge:
|
||||
sqrt_fixup_${ALGO}_soft_aes_sandybridge:
|
||||
dec rdx
|
||||
mov r15d, -1022
|
||||
shl r15, 32
|
||||
|
@ -262,6 +262,6 @@ sqrt_fixup_soft_aes_sandybridge:
|
|||
sub rcx, r9
|
||||
adc rdx, 0
|
||||
movq xmm1, rdx
|
||||
jmp sqrt_fixup_soft_aes_sandybridge_ret
|
||||
jmp sqrt_fixup_${ALGO}_soft_aes_sandybridge_ret
|
||||
|
||||
cnv2_mainloop_soft_aes_sandybridge_asm_endp:
|
||||
cnv2_main_loop_${ALGO}_soft_aes_sandybridge_endp:
|
|
@ -52,6 +52,7 @@ int64_t Client::m_sequence = 1;
|
|||
Client::Client(int id, const char *agent, IClientListener *listener) :
|
||||
m_quiet(false),
|
||||
m_nicehash(false),
|
||||
m_donate(false),
|
||||
m_agent(agent),
|
||||
m_listener(listener),
|
||||
m_id(id),
|
||||
|
@ -231,7 +232,7 @@ bool Client::parseJob(const rapidjson::Value ¶ms, int *code)
|
|||
|
||||
PowVariant powVariant = Options::i()->powVariant();
|
||||
|
||||
if (!Options::i()->forcePowVariant()) {
|
||||
if (!Options::i()->forcePowVariant() || m_donate) {
|
||||
if (params.HasMember("algo")) {
|
||||
std::string algo = params["algo"].GetString();
|
||||
|
||||
|
|
|
@ -67,6 +67,7 @@ public:
|
|||
inline int id() const { return m_id; }
|
||||
inline uint16_t port() const { return m_url.port(); }
|
||||
inline void setQuiet(bool quiet) { m_quiet = quiet; }
|
||||
inline void setDonate(bool donate) { m_donate = donate; }
|
||||
inline void setRetryPause(int ms) { m_retryPause = ms; }
|
||||
|
||||
static void onConnected(uv_async_t *handle);
|
||||
|
@ -99,6 +100,7 @@ private:
|
|||
|
||||
bool m_quiet;
|
||||
bool m_nicehash;
|
||||
bool m_donate;
|
||||
char m_buf[2048];
|
||||
char m_rpcId[64];
|
||||
char m_sendBuf[768];
|
||||
|
|
|
@ -150,9 +150,9 @@ PowVariant Job::powVariant() const
|
|||
} else {
|
||||
return PowVariant::POW_V0;
|
||||
}
|
||||
} else if (m_powVariant == PowVariant::POW_XTL) {
|
||||
if (m_blob[0] > 5) {
|
||||
return PowVariant::POW_XTL_V9;
|
||||
} else if (m_powVariant == PowVariant::POW_MSR) {
|
||||
if (m_blob[0] > 8) {
|
||||
return PowVariant::POW_FAST_2;
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -67,7 +67,7 @@ public:
|
|||
bool operator!=(const Job &other) const;
|
||||
|
||||
private:
|
||||
uint8_t m_blob[96]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk.
|
||||
uint8_t m_blob[MAX_BLOB_SIZE]; // Max blob size is 84 (75 fixed + 9 variable), aligned to 96. https://github.com/xmrig/xmrig/issues/1 Thanks fireice-uk.
|
||||
|
||||
bool m_nicehash;
|
||||
int m_poolId;
|
||||
|
|
|
@ -81,6 +81,7 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
|
|||
m_client->setUrl(url);
|
||||
m_client->setRetryPause(Options::i()->retryPause() * 1000);
|
||||
m_client->setQuiet(true);
|
||||
m_client->setDonate(true);
|
||||
|
||||
delete url;
|
||||
|
||||
|
|
|
@ -63,7 +63,7 @@ public:
|
|||
State(size_t hashMultiplier)
|
||||
{
|
||||
nonces = new uint32_t[hashMultiplier];
|
||||
blob = new uint8_t[84 * hashMultiplier];
|
||||
blob = new uint8_t[MAX_BLOB_SIZE * hashMultiplier];
|
||||
|
||||
for(size_t i=0; i<hashMultiplier; ++i) {
|
||||
nonces[i] = 0;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue