Integrated CN-UPX2/extremelite (#247)

* WIP

* Added win/asm for upx2

* Added donation servers and fixed windows ASM variant

* #1.9.2 preparation
Ben Gräf 2019-04-24 22:34:30 +02:00 committed by GitHub
parent 1b0557d3b4
commit ff4058a2a9
22 changed files with 349 additions and 173 deletions

View file

@@ -1,3 +1,6 @@
+# 1.9.2
+- Integrated cn-extremelite a.k.a upx2 (algo: "cryptonight-extremelite", variant: "upx2")
+- Integrated merged templates and replace of @WORKER-ID@ in template assignment
 # 1.9.1
 - Fix coloring of outdated miners on Dashboard
 - Autodetect for fork of CN/R(variant: "auto"), Graft(variant: "rwz"), Zelerius(variant: "zls")

View file

@@ -131,6 +131,29 @@ configure_file("src/crypto/asm/win/cnv2_main_loop_ryzen.inc.in" "src/crypto/asm/
 configure_file("src/crypto/asm/win/cnv2_double_main_loop_sandybridge.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_ultralite_sandybridge.inc")
 configure_file("src/crypto/asm/win/cnv2_main_loop_soft_aes_sandybridge.inc.in" "src/crypto/asm/win/cnv2_main_loop_ultralite_soft_aes_sandybridge.inc")
+# CN V2 RWZ
+set(ALGO "original")
+set(ITERATIONS "393216") #0x60000
+set(MASK "2097136") #0x1FFFF0
+configure_file("src/crypto/asm/cnv2_main_loop_rwz_all.inc.in" "src/crypto/asm/cnv2_main_loop_rwz_original_all.inc")
+configure_file("src/crypto/asm/cnv2_double_main_loop_rwz_all.inc.in" "src/crypto/asm/cnv2_double_main_loop_rwz_original_all.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_rwz_all.inc.in" "src/crypto/asm/win/cnv2_main_loop_rwz_original_all.inc")
+configure_file("src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_rwz_original_all.inc")
+# CN V2 UPX2
+set(ALGO "upx2")
+set(ITERATIONS "16384") #0x4000
+set(MASK "131056") #0x1FFF0
+configure_file("src/crypto/asm/cnv2_main_loop_rwz_all.inc.in" "src/crypto/asm/cnv2_main_loop_rwz_upx2_all.inc")
+configure_file("src/crypto/asm/cnv2_double_main_loop_rwz_all.inc.in" "src/crypto/asm/cnv2_double_main_loop_rwz_upx2_all.inc")
+configure_file("src/crypto/asm/win/cnv2_main_loop_rwz_all.inc.in" "src/crypto/asm/win/cnv2_main_loop_rwz_upx2_all.inc")
+configure_file("src/crypto/asm/win/cnv2_double_main_loop_rwz_all.inc.in" "src/crypto/asm/win/cnv2_double_main_loop_rwz_upx2_all.inc")
 if (CMAKE_C_COMPILER_ID MATCHES MSVC)
 enable_language(ASM_MASM)
 set(XMRIG_ASM_FILE "src/crypto/asm/win/cn_main_loop.asm"

View file

@@ -67,6 +67,9 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
 size_t cache = availableCache();
 size_t algoBlockSize;
 switch (algo) {
+case Options::ALGO_CRYPTONIGHT_EXTREMELITE:
+algoBlockSize = 128;
+break;
 case Options::ALGO_CRYPTONIGHT_ULTRALITE:
 algoBlockSize = 256;
 break;

View file

@@ -44,6 +44,9 @@ ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId)
 case Options::ALGO_CRYPTONIGHT_ULTRALITE:
 scratchPadSize = MEMORY_ULTRA_LITE;
 break;
+case Options::ALGO_CRYPTONIGHT_EXTREMELITE:
+scratchPadSize = MEMORY_EXTREME_LITE;
+break;
 case Options::ALGO_CRYPTONIGHT_SUPERLITE:
 scratchPadSize = MEMORY_SUPER_LITE;
 break;

View file

@@ -308,6 +308,7 @@ static const char *algo_names[] = {
 "cryptonight-lite",
 "cryptonight-superlite",
 "cryptonight-ultralite",
+"cryptonight-extremelite",
 "cryptonight-heavy"
 };

@@ -316,6 +317,7 @@ static const char *algo_short_names[] = {
 "cn-lite",
 "cn-superlite",
 "cn-ultralite",
+"cn-extremelite",
 "cn-heavy"
 };

@@ -337,7 +339,8 @@ constexpr static const char *pow_variant_names[] = {
 "hosp",
 "wow",
 "r",
-"xcash"
+"xcash",
+"upx2"
 };
 constexpr static const char *asm_optimization_names[] = {

@@ -1112,12 +1115,16 @@ bool Options::setAlgo(const char *algo)
 break;
 }
 if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-ultra-lite") || !strcmp(algo, "cryptonight-ultra-lite") || !strcmp(algo, "cryptonight-ultralight") || !strcmp(algo, "cryptonight-turtle") || !strcmp(algo, "cn-turtle") || !strcmp(algo, "cryptonight-pico") || !strcmp(algo, "cn-pico"))) {
 m_algo = ALGO_CRYPTONIGHT_ULTRALITE;
 break;
 }
+if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cn-extreme-lite") || !strcmp(algo, "cryptonight-extreme-lite") || !strcmp(algo, "cryptonight-extremelight") || !strcmp(algo, "cryptonight-upx2") || !strcmp(algo, "cn-upx2") || !strcmp(algo, "cryptonight-femto") || !strcmp(algo, "cn-femto"))) {
+m_algo = ALGO_CRYPTONIGHT_EXTREMELITE;
+break;
+}
 if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) {
 showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")");
 m_algo = ALGO_CRYPTONIGHT_LITE;

@@ -1215,6 +1222,11 @@ bool Options::parsePowVariant(const char *powVariant)
 break;
 }
+if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "upx2") || !strcmp(powVariant, "upxv2") || !strcmp(powVariant, "femto"))) {
+m_powVariant = POW_UPX2;
+break;
+}
 if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "hosp") || !strcmp(powVariant, "hospital"))) {
 m_powVariant = POW_HOSP;
 break;
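
Taken together, the Options.cpp additions mean any of the long-form names ("cn-upx2", "cryptonight-femto", "cn-extreme-lite", ...) select ALGO_CRYPTONIGHT_EXTREMELITE, while "upx2", "upxv2" and "femto" select POW_UPX2. The helper below is a hypothetical standalone sketch of that alias matching (illustration only, not the miner's API), assuming plain strcmp comparison as in the diff:

    #include <cstring>

    // Hypothetical helper mirroring the alias chains added to Options::setAlgo();
    // illustration only, not code from this commit.
    static bool isExtremeliteAlgoName(const char* name) {
        static const char* const kAliases[] = {
            "cryptonight-extremelite", "cn-extremelite",
            "cn-extreme-lite", "cryptonight-extreme-lite", "cryptonight-extremelight",
            "cryptonight-upx2", "cn-upx2", "cryptonight-femto", "cn-femto"
        };
        for (const char* alias : kAliases) {
            if (std::strcmp(name, alias) == 0) {
                return true;   // any of these strings means CN-Extremelite/UPX2
            }
        }
        return false;
    }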

View file

@@ -50,6 +50,7 @@ public:
 ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (1MB ScratchPad) */
 ALGO_CRYPTONIGHT_SUPERLITE, /* CryptoNight-Superlite (512KB ScratchPad) */
 ALGO_CRYPTONIGHT_ULTRALITE, /* CryptoNight-Ultralite (256KB ScratchPad) */
+ALGO_CRYPTONIGHT_EXTREMELITE, /* CryptoNight-Verylite (128KB ScratchPad) */
 ALGO_CRYPTONIGHT_HEAVY, /* CryptoNight-Heavy (4MB ScratchPad) */
 };

View file

@@ -44,6 +44,7 @@ enum PowVariant
 POW_DOUBLE,
 POW_ZELERIUS,
 POW_RWZ,
+POW_UPX2,
 LAST_ITEM
 };

@@ -89,6 +90,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
 return "zls";
 case POW_RWZ:
 return "rwz";
+case POW_UPX2:
+return "upx2";
 case POW_AUTODETECT:
 default:
 return "-1";

@@ -174,6 +177,8 @@ inline PowVariant parseVariant(const std::string variant)
 powVariant = PowVariant::POW_ZELERIUS;
 } else if (variant == "rwz" || variant == "graft") {
 powVariant = PowVariant::POW_RWZ;
+} else if (variant == "upx2") {
+powVariant = PowVariant::POW_UPX2;
 }
 return powVariant;
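
With POW_UPX2 wired into both inline helpers above, the new variant name round-trips through PowVariant.h. A minimal usage sketch (assuming the header is on the include path) looks like this:

    #include <cassert>

    #include "PowVariant.h"  // assumed include path

    int main() {
        // parseVariant() and getPowVariantName() are the inline helpers shown in this hunk.
        PowVariant variant = parseVariant("upx2");
        assert(variant == PowVariant::POW_UPX2);
        assert(getPowVariantName(variant) == "upx2");
        return 0;
    }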

View file

@@ -398,6 +398,26 @@ static void cryptonight_ultra_lite_softaes(AsmOptimization asmOptimization, uint
 #endif
 }
+template <size_t NUM_HASH_BLOCKS>
+static void cryptonight_extreme_lite_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+# if !defined(XMRIG_ARMv7)
+#if defined(XMRIG_ARM)
+CryptoNightMultiHash<0x4000, POW_DEFAULT_INDEX_SHIFT, MEMORY_EXTREME_LITE, 0x1FFF0, false, POW_UPX2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+#else
+if ((asmOptimization != AsmOptimization::ASM_OFF && NUM_HASH_BLOCKS <= 2)) {
+CryptoNightMultiHash<0x4000, POW_DEFAULT_INDEX_SHIFT, MEMORY_EXTREME_LITE, 0x1FFF0, false, POW_UPX2, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization);
+} else {
+CryptoNightMultiHash<0x4000, POW_DEFAULT_INDEX_SHIFT, MEMORY_EXTREME_LITE, 0x1FFF0, false, POW_UPX2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+}
+#endif
+# endif
+}
+template <size_t NUM_HASH_BLOCKS>
+static void cryptonight_extreme_lite_softaes(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
+CryptoNightMultiHash<0x4000, POW_DEFAULT_INDEX_SHIFT, MEMORY_EXTREME_LITE, 0x1FFF0, true, POW_UPX2, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad);
+}
 template <size_t NUM_HASH_BLOCKS>
 static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, uint64_t height, PowVariant variant, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 # if !defined(XMRIG_ARMv7)

@@ -464,6 +484,14 @@ void setCryptoNightHashMethods(Options::Algo algo, bool aesni)
 }
 break;
+case Options::ALGO_CRYPTONIGHT_EXTREMELITE:
+if (aesni) {
+cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_extreme_lite_aesni<HASH_FACTOR>;
+} else {
+cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_extreme_lite_softaes<HASH_FACTOR>;
+}
+break;
 case Options::ALGO_CRYPTONIGHT_HEAVY:
 if (aesni) {
 cryptonight_hash_ctx[HASH_FACTOR - 1] = cryptonight_heavy_aesni<HASH_FACTOR>;

@@ -546,6 +574,7 @@ bool CryptoNight::selfCheck(int algo)
 bool resultLite = true;
 bool resultSuperLite = true;
 bool resultUltraLite = true;
+bool resultExtremeLite = true;
 bool resultHeavy = true;
 AsmOptimization asmOptimization = Options::i()->asmOptimization();

@@ -678,9 +707,7 @@ bool CryptoNight::selfCheck(int algo)
 resultLite = resultLite && memcmp(output, test_output_upx, 32) == 0;
 } else if (algo == Options::ALGO_CRYPTONIGHT_SUPERLITE) {
 return false;
 } else if (algo == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
 // cn ultralite (cnv8 + turtle)

@@ -691,6 +718,16 @@ bool CryptoNight::selfCheck(int algo)
 cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_TURTLE, test_input, 76, output, scratchPads);
 resultUltraLite = resultUltraLite && memcmp(output, test_output_turtle, 64) == 0;
 #endif
+} else if (algo == Options::ALGO_CRYPTONIGHT_EXTREMELITE) {
+// cn extremelite (cnv8 + upx2)
+cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_UPX2, test_input, 76, output, scratchPads);
+resultExtremeLite = resultExtremeLite && memcmp(output, test_output_upx2, 32) == 0;
+#if MAX_NUM_HASH_BLOCKS > 1
+cryptonight_hash_ctx[1](asmOptimization, 0, PowVariant::POW_UPX2, test_input, 76, output, scratchPads);
+resultExtremeLite = resultExtremeLite && memcmp(output, test_output_upx2, 64) == 0;
+#endif
 } else {
 // cn v0 aka orignal
 cryptonight_hash_ctx[0](asmOptimization, 0, PowVariant::POW_V0,test_input, 76, output, scratchPads);

@@ -858,5 +895,5 @@ bool CryptoNight::selfCheck(int algo)
 _mm_free(scratchPads[i]);
 }
-return result && resultLite && resultSuperLite && resultUltraLite && resultHeavy;
+return result && resultLite && resultSuperLite && resultUltraLite && resultExtremeLite && resultHeavy;
 }

View file

@@ -37,6 +37,7 @@
 #define MEMORY_LITE 1048576 /* 1 MiB */
 #define MEMORY_SUPER_LITE 524288 /* 512 KiB */
 #define MEMORY_ULTRA_LITE 262144 /* 256 KiB */
+#define MEMORY_EXTREME_LITE 131072 /* 128 KiB */
 #define MEMORY_HEAVY 4194304 /* 4 MiB */
 #define POW_DEFAULT_INDEX_SHIFT 3
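
The new constant lines up with the UPX2 parameters set in CMakeLists.txt and used by the hashing templates above (ITERATIONS 0x4000, MASK 0x1FFF0): the mask is simply the 128 KiB scratchpad size minus one 16-byte line. A small sanity-check sketch, with the values copied from this diff:

    // Values as they appear in this commit; the static_asserts only restate them.
    static_assert(131072 == 128 * 1024, "MEMORY_EXTREME_LITE is 128 KiB");
    static_assert(0x1FFF0 == 131072 - 16, "the UPX2 MASK addresses the pad in 16-byte steps");
    static_assert(0x4000 == 16384, "UPX2 ITERATIONS as configured in CMake");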

View file

@@ -915,7 +915,7 @@ public:
 cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));

@@ -929,7 +929,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -1533,8 +1533,8 @@ public:
 cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -1550,7 +1550,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -1573,7 +1573,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -2479,9 +2479,9 @@ public:
 cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -2499,7 +2499,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -2522,7 +2522,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -2545,7 +2545,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -3750,10 +3750,10 @@ public:
 cx3 = _mm_aesenc_si128(cx3, ax3);
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -3773,7 +3773,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -3796,7 +3796,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -3819,7 +3819,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -3842,7 +3842,7 @@ public:
 lo = __umul128(idx3, cl, &hi);
-SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al3 += hi;
 ah3 += lo;

@@ -4845,11 +4845,11 @@ public:
 cx4 = _mm_aesenc_si128(cx4, ax4);
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -4871,7 +4871,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -4894,7 +4894,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -4917,7 +4917,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -4940,7 +4940,7 @@ public:
 lo = __umul128(idx3, cl, &hi);
-SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al3 += hi;
 ah3 += lo;

@@ -4963,7 +4963,7 @@ public:
 lo = __umul128(idx4, cl, &hi);
-SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al4 += hi;
 ah4 += lo;

View file

@@ -273,4 +273,13 @@ const static uint8_t test_output_turtle[64] = {
 0xE3, 0x54, 0x58, 0x2B, 0xCB, 0x93, 0xF8, 0x69, 0xD4, 0x29, 0x74, 0x4D, 0xE5, 0x72, 0x6A, 0x26
 };
+// CN-Extremelite/UPX2
+const static uint8_t test_output_upx2[64] = {
+0xAA, 0xBB, 0xB8, 0xED, 0x14, 0xA8, 0x35, 0xFA, 0x22, 0xCF, 0xB1, 0xB5, 0xDE, 0xA8, 0x72, 0xB0,
+0xA1, 0xD6, 0xCB, 0xD8, 0x46, 0xF4, 0x39, 0x1C, 0x0F, 0x01, 0xF3, 0x87, 0x5E, 0x3A, 0x37, 0x61,
+0x38, 0x59, 0x15, 0x72, 0xF8, 0x20, 0xD4, 0xDE, 0x25, 0x3C, 0xF5, 0x5A, 0x21, 0x92, 0xB6, 0x22,
+0xB0, 0x28, 0x9E, 0x2E, 0x5C, 0x36, 0x16, 0xE6, 0x1E, 0x78, 0x7A, 0x8F, 0xE4, 0x62, 0xEC, 0x5A
+};
 #endif /* __CRYPTONIGHT_TEST_H__ */

View file

@@ -95,8 +95,11 @@ extern "C"
 void cnv2_main_loop_zelerius_bulldozer_asm(ScratchPad* ctx0);
 void cnv2_double_main_loop_zelerius_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1);
-void cnv2_main_loop_rwz_all_asm(ScratchPad* ctx0);
+void cnv2_main_loop_rwz_original_all_asm(ScratchPad* ctx0);
-void cnv2_double_main_loop_rwz_all_asm(ScratchPad* ctx0, ScratchPad* ctx1);
+void cnv2_double_main_loop_rwz_original_all_asm(ScratchPad* ctx0, ScratchPad* ctx1);
+void cnv2_main_loop_rwz_upx2_all_asm(ScratchPad* ctx0);
+void cnv2_double_main_loop_rwz_upx2_all_asm(ScratchPad* ctx0, ScratchPad* ctx1);
 void cnv1_main_loop_soft_aes_sandybridge_asm(ScratchPad* ctx0);
 void cnv1_main_loop_lite_soft_aes_sandybridge_asm(ScratchPad* ctx0);

@@ -994,7 +997,7 @@ public:
 cx = _mm_aesenc_si128(cx, ax);
 }
-SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx));

@@ -1008,7 +1011,7 @@ public:
 lo = __umul128(idx, cl, &hi);
-SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al += hi; // two fence statements are overhead
 ah += lo;

@@ -1082,7 +1085,10 @@ public:
 cnv2_main_loop_zelerius_ivybridge_asm(scratchPad[0]);
 break;
 case POW_RWZ:
-cnv2_main_loop_rwz_all_asm(scratchPad[0]);
+cnv2_main_loop_rwz_original_all_asm(scratchPad[0]);
+break;
+case POW_UPX2:
+cnv2_main_loop_rwz_upx2_all_asm(scratchPad[0]);
 break;
 default:
 cnv2_main_loop_ivybridge_asm(scratchPad[0]);

@@ -1105,7 +1111,10 @@ public:
 cnv2_main_loop_zelerius_ryzen_asm(scratchPad[0]);
 break;
 case POW_RWZ:
-cnv2_main_loop_rwz_all_asm(scratchPad[0]);
+cnv2_main_loop_rwz_original_all_asm(scratchPad[0]);
+break;
+case POW_UPX2:
+cnv2_main_loop_rwz_upx2_all_asm(scratchPad[0]);
 break;
 default:
 cnv2_main_loop_ryzen_asm(scratchPad[0]);

@@ -1127,7 +1136,10 @@ public:
 cnv2_main_loop_zelerius_bulldozer_asm(scratchPad[0]);
 break;
 case POW_RWZ:
-cnv2_main_loop_rwz_all_asm(scratchPad[0]);
+cnv2_main_loop_rwz_original_all_asm(scratchPad[0]);
+break;
+case POW_UPX2:
+cnv2_main_loop_rwz_upx2_all_asm(scratchPad[0]);
 break;
 default:
 cnv2_main_loop_bulldozer_asm(scratchPad[0]);

@@ -1785,8 +1797,8 @@ public:
 cx1 = _mm_aesenc_si128(cx1, ax1);
 }
-SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -1807,7 +1819,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -1876,7 +1888,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -1935,7 +1947,10 @@ public:
 cnv2_double_main_loop_zelerius_sandybridge_asm(scratchPad[0], scratchPad[1]);
 break;
 case POW_RWZ:
-cnv2_double_main_loop_rwz_all_asm(scratchPad[0], scratchPad[1]);
+cnv2_double_main_loop_rwz_original_all_asm(scratchPad[0], scratchPad[1]);
+break;
+case POW_UPX2:
+cnv2_double_main_loop_rwz_upx2_all_asm(scratchPad[0], scratchPad[1]);
 break;
 default:
 cnv2_double_main_loop_sandybridge_asm(scratchPad[0], scratchPad[1]);

@@ -2885,9 +2900,9 @@ public:
 cx2 = _mm_aesenc_si128(cx2, ax2);
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -2905,7 +2920,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -2928,7 +2943,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -2950,7 +2965,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -4167,10 +4182,10 @@ public:
 cx3 = _mm_aesenc_si128(cx3, ax3);
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -4190,7 +4205,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -4213,7 +4228,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -4236,7 +4251,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -4259,7 +4274,7 @@ public:
 lo = __umul128(idx3, cl, &hi);
-SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al3 += hi;
 ah3 += lo;

@@ -5297,11 +5312,11 @@ public:
 cx4 = _mm_aesenc_si128(cx4, ax4);
 }
-SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
-SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0));
 _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1));

@@ -5323,7 +5338,7 @@ public:
 lo = __umul128(idx0, cl, &hi);
-SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al0 += hi;
 ah0 += lo;

@@ -5346,7 +5361,7 @@ public:
 lo = __umul128(idx1, cl, &hi);
-SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al1 += hi;
 ah1 += lo;

@@ -5369,7 +5384,7 @@ public:
 lo = __umul128(idx2, cl, &hi);
-SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al2 += hi;
 ah2 += lo;

@@ -5392,7 +5407,7 @@ public:
 lo = __umul128(idx3, cl, &hi);
-SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al3 += hi;
 ah3 += lo;

@@ -5415,7 +5430,7 @@ public:
 lo = __umul128(idx4, cl, &hi);
-SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ)
+SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi, VARIANT == POW_RWZ || VARIANT == POW_UPX2)
 al4 += hi;
 ah4 += lo;

View file

@@ -38,8 +38,11 @@
 .global FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm)
 .global FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm)
-.global FN_PREFIX(cnv2_main_loop_rwz_all_asm)
+.global FN_PREFIX(cnv2_main_loop_rwz_original_all_asm)
-.global FN_PREFIX(cnv2_double_main_loop_rwz_all_asm)
+.global FN_PREFIX(cnv2_double_main_loop_rwz_original_all_asm)
+.global FN_PREFIX(cnv2_main_loop_rwz_upx2_all_asm)
+.global FN_PREFIX(cnv2_double_main_loop_rwz_upx2_all_asm)
 .global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)

@@ -363,10 +366,10 @@ ALIGN 16
 #else
 ALIGN 64
 #endif
-FN_PREFIX(cnv2_main_loop_rwz_all_asm):
+FN_PREFIX(cnv2_main_loop_rwz_original_all_asm):
 sub rsp, 48
 mov rcx, rdi
-#include "cnv2_main_loop_rwz_all.inc"
+#include "cnv2_main_loop_rwz_original_all.inc"
 add rsp, 48
 ret 0

@@ -375,11 +378,36 @@ ALIGN 16
 #else
 ALIGN 64
 #endif
-FN_PREFIX(cnv2_double_main_loop_rwz_all_asm):
+FN_PREFIX(cnv2_double_main_loop_rwz_original_all_asm):
 sub rsp, 48
 mov rcx, rdi
 mov rdx, rsi
-#include "cnv2_double_main_loop_rwz_all.inc"
+#include "cnv2_double_main_loop_rwz_original_all.inc"
+add rsp, 48
+ret 0
+#ifdef __APPLE__
+ALIGN 16
+#else
+ALIGN 64
+#endif
+FN_PREFIX(cnv2_main_loop_rwz_upx2_all_asm):
+sub rsp, 48
+mov rcx, rdi
+#include "cnv2_main_loop_rwz_upx2_all.inc"
+add rsp, 48
+ret 0
+#ifdef __APPLE__
+ALIGN 16
+#else
+ALIGN 64
+#endif
+FN_PREFIX(cnv2_double_main_loop_rwz_upx2_all_asm):
+sub rsp, 48
+mov rcx, rdi
+mov rdx, rsi
+#include "cnv2_double_main_loop_rwz_upx2_all.inc"
 add rsp, 48
 ret 0

View file

@@ -18,7 +18,7 @@
 mov r10, QWORD PTR [rcx+32]
 mov r8, rcx
 xor r10, QWORD PTR [rcx]
-mov r14d, 393216
+mov r14d, ${ITERATIONS}
 mov r11, QWORD PTR [rcx+40]
 xor r11, QWORD PTR [rcx+8]
 mov rsi, QWORD PTR [rdx+224]

@@ -41,7 +41,7 @@
 movaps XMMWORD PTR [rsp+16], xmm15
 mov rdx, r10
 movq xmm4, QWORD PTR [r8+96]
-and edx, 2097136
+and edx, ${MASK}
 mov rax, QWORD PTR [rcx+48]
 xorps xmm13, xmm13
 xor rax, QWORD PTR [rcx+16]

@@ -83,7 +83,7 @@
 mov rcx, rdi
 mov QWORD PTR [rsp+264], r11
 movq xmm8, rax
-and ecx, 2097136
+and ecx, ${MASK}
 punpcklqdq xmm8, xmm0
 movq xmm0, QWORD PTR [r9+96]
 punpcklqdq xmm4, xmm0

@@ -99,7 +99,7 @@
 #else
 ALIGN(64)
 #endif
-rwz_main_loop_double:
+rwz_main_loop_double_${ALGO}:
 movdqu xmm9, xmm15
 mov eax, edx
 mov ebx, edx

@@ -124,7 +124,7 @@ rwz_main_loop_double:
 movq r11, xmm9
 mov edx, r11d
-and edx, 2097136
+and edx, ${MASK}
 movdqa xmm0, xmm9
 pxor xmm0, xmm7
 movdqu XMMWORD PTR [r9], xmm0

@@ -155,7 +155,7 @@ rwz_main_loop_double:
 movdqu XMMWORD PTR [rax+rsi], xmm0
 movq rcx, xmm10
-and ecx, 2097136
+and ecx, ${MASK}
 movdqa xmm0, xmm10
 pxor xmm0, xmm6

@@ -203,7 +203,7 @@ rwz_main_loop_double:
 mov QWORD PTR [rbx+8], rdx
 xor rdx, r9
 mov QWORD PTR [rsp+256], r11
-and r11d, 2097136
+and r11d, ${MASK}
 mov QWORD PTR [rsp+264], rdx
 mov QWORD PTR [rsp+8], r11
 lea r15, QWORD PTR [r11+r13]

@@ -253,8 +253,8 @@ rwz_main_loop_double:
 mov rbx, rax
 imul rax, rdx
 sub r11, rax
-js rwz_div_fix_1
+js rwz_div_fix_1_${ALGO}
-rwz_div_fix_1_ret:
+rwz_div_fix_1_${ALGO}_ret:
 cvttsd2si rdx, xmm2
 mov rax, rdx

@@ -262,8 +262,8 @@ rwz_div_fix_1_ret:
 movd xmm2, r11d
 movd xmm4, ebx
 sub r8, rax
-js rwz_div_fix_2
+js rwz_div_fix_2_${ALGO}
-rwz_div_fix_2_ret:
+rwz_div_fix_2_${ALGO}_ret:
 movd xmm1, r8d
 movd xmm0, edx

@@ -279,15 +279,15 @@ rwz_div_fix_2_ret:
 movdqa xmm5, xmm1
 psrlq xmm5, 19
 test r9, 524287
-je rwz_sqrt_fix_1
+je rwz_sqrt_fix_1_${ALGO}
-rwz_sqrt_fix_1_ret:
+rwz_sqrt_fix_1_${ALGO}_ret:
 movq r9, xmm10
 psrldq xmm1, 8
 movq r8, xmm1
 test r8, 524287
-je rwz_sqrt_fix_2
+je rwz_sqrt_fix_2_${ALGO}
-rwz_sqrt_fix_2_ret:
+rwz_sqrt_fix_2_${ALGO}_ret:
 mov r12d, ecx
 mov r8d, ecx

@@ -317,7 +317,7 @@ rwz_sqrt_fix_2_ret:
 mov QWORD PTR [r13], rdi
 xor rdi, r10
 mov ecx, edi
-and ecx, 2097136
+and ecx, ${MASK}
 lea r8, QWORD PTR [rcx+rsi]
 mov rdx, QWORD PTR [r13+8]

@@ -335,7 +335,7 @@ rwz_sqrt_fix_2_ret:
 movdqa xmm6, xmm10
 mov r9, r15
 dec r14d
-jne rwz_main_loop_double
+jne rwz_main_loop_double_${ALGO}
 ldmxcsr DWORD PTR [rsp+272]
 movaps xmm13, XMMWORD PTR [rsp+48]

@@ -358,19 +358,19 @@ rwz_sqrt_fix_2_ret:
 pop rsi
 pop rbp
 pop rbx
-jmp rwz_cnv2_double_mainloop_asm_endp
+jmp rwz_cnv2_double_mainloop_${ALGO}_asm_endp
-rwz_div_fix_1:
+rwz_div_fix_1_${ALGO}:
 dec rbx
 add r11, rdx
-jmp rwz_div_fix_1_ret
+jmp rwz_div_fix_1_${ALGO}_ret
-rwz_div_fix_2:
+rwz_div_fix_2_${ALGO}:
 dec rdx
 add r8, r9
-jmp rwz_div_fix_2_ret
+jmp rwz_div_fix_2_${ALGO}_ret
-rwz_sqrt_fix_1:
+rwz_sqrt_fix_1_${ALGO}:
 movq r8, xmm3
 movdqa xmm0, xmm5
 psrldq xmm0, 8

@@ -389,9 +389,9 @@ rwz_sqrt_fix_1:
 adc r9, 0
 movq xmm5, r9
 punpcklqdq xmm5, xmm0
-jmp rwz_sqrt_fix_1_ret
+jmp rwz_sqrt_fix_1_${ALGO}_ret
-rwz_sqrt_fix_2:
+rwz_sqrt_fix_2_${ALGO}:
 psrldq xmm3, 8
 movq r11, xmm3
 dec r8

@@ -409,6 +409,6 @@ rwz_sqrt_fix_2:
 adc r8, 0
 movq xmm0, r8
 punpcklqdq xmm5, xmm0
-jmp rwz_sqrt_fix_2_ret
+jmp rwz_sqrt_fix_2_${ALGO}_ret
-rwz_cnv2_double_mainloop_asm_endp:
+rwz_cnv2_double_mainloop_${ALGO}_asm_endp:

View file

@@ -15,7 +15,7 @@
 mov rax, QWORD PTR [rcx+48]
 mov r9, rcx
 xor rax, QWORD PTR [rcx+16]
-mov esi, 393216
+mov esi, ${ITERATIONS}
 mov r8, QWORD PTR [rcx+32]
 mov r13d, -2147483647
 xor r8, QWORD PTR [rcx]

@@ -35,7 +35,7 @@
 movaps XMMWORD PTR [rsp+64], xmm6
 movaps XMMWORD PTR [rsp+48], xmm7
 movaps XMMWORD PTR [rsp+32], xmm8
-and r10d, 2097136
+and r10d, ${MASK}
 movq xmm5, rax
 xor eax, eax

@@ -55,7 +55,7 @@
 #else
 ALIGN(64)
 #endif
-rwz_main_loop:
+rwz_main_loop_${ALGO}:
 lea rdx, QWORD PTR [r10+rbx]
 mov ecx, r10d
 mov eax, r10d

@@ -69,7 +69,7 @@ rwz_main_loop:
 aesenc xmm6, xmm7
 movq rbp, xmm6
 mov r9, rbp
-and r9d, 2097136
+and r9d, ${MASK}
 movdqu xmm0, XMMWORD PTR [rcx+rbx]
 movdqu xmm1, XMMWORD PTR [rax+rbx]
 movdqu xmm2, XMMWORD PTR [r10+rbx]

@@ -113,9 +113,9 @@ rwz_main_loop:
 psubq xmm3, XMMWORD PTR [rsp+16]
 movq rdx, xmm3
 test edx, 524287
-je rwz_sqrt_fixup
+je rwz_sqrt_fixup_${ALGO}
 psrlq xmm3, 19
-rwz_sqrt_fixup_ret:
+rwz_sqrt_fixup_${ALGO}_ret:
 mov ecx, r10d
 mov rax, rdi

@@ -126,7 +126,7 @@ rwz_sqrt_fixup_ret:
 mov QWORD PTR [r14], r8
 xor r8, rdi
 mov edi, r8d
-and edi, 2097136
+and edi, ${MASK}
 movq xmm0, rax
 xor rax, [rcx+rbx+8]
 add r11, rax

@@ -151,7 +151,7 @@ rwz_sqrt_fixup_ret:
 mov r10d, edi
 xor r11, r12
 dec rsi
-jne rwz_main_loop
+jne rwz_main_loop_${ALGO}
 ldmxcsr DWORD PTR [rsp]
 mov rbx, QWORD PTR [rsp+160]

@@ -166,9 +166,9 @@ rwz_sqrt_fixup_ret:
 pop rdi
 pop rsi
 pop rbp
-jmp cnv2_rwz_main_loop_endp
+jmp cnv2_rwz_main_loop_${ALGO}_endp
-rwz_sqrt_fixup:
+rwz_sqrt_fixup_${ALGO}:
 dec rdx
 mov r13d, -1022
 shl r13, 32

@@ -185,6 +185,6 @@ rwz_sqrt_fixup:
 sub rcx, r9
 adc rdx, 0
 movq xmm3, rdx
-jmp rwz_sqrt_fixup_ret
+jmp rwz_sqrt_fixup_${ALGO}_ret
-cnv2_rwz_main_loop_endp:
+cnv2_rwz_main_loop_${ALGO}_endp:

View file

@@ -31,8 +31,11 @@ PUBLIC cnv2_main_loop_zelerius_ryzen_asm
 PUBLIC cnv2_main_loop_zelerius_bulldozer_asm
 PUBLIC cnv2_double_main_loop_zelerius_sandybridge_asm
-PUBLIC cnv2_main_loop_rwz_all_asm
+PUBLIC cnv2_main_loop_rwz_original_all_asm
-PUBLIC cnv2_double_main_loop_rwz_all_asm
+PUBLIC cnv2_double_main_loop_rwz_original_all_asm
+PUBLIC cnv2_main_loop_rwz_upx2_all_asm
+PUBLIC cnv2_double_main_loop_rwz_upx2_all_asm
 PUBLIC cnv1_main_loop_soft_aes_sandybridge_asm
 PUBLIC cnv1_main_loop_lite_soft_aes_sandybridge_asm

@@ -197,16 +200,28 @@ cnv2_double_main_loop_zelerius_sandybridge_asm PROC
 cnv2_double_main_loop_zelerius_sandybridge_asm ENDP
 ALIGN 64
-cnv2_main_loop_rwz_all_asm PROC
+cnv2_main_loop_rwz_original_all_asm PROC
-INCLUDE cnv2_main_loop_rwz_all.inc
+INCLUDE cnv2_main_loop_rwz_original_all.inc
 ret 0
-cnv2_main_loop_rwz_all_asm ENDP
+cnv2_main_loop_rwz_original_all_asm ENDP
 ALIGN 64
-cnv2_double_main_loop_rwz_all_asm PROC
+cnv2_double_main_loop_rwz_original_all_asm PROC
-INCLUDE cnv2_double_main_loop_rwz_all.inc
+INCLUDE cnv2_double_main_loop_rwz_original_all.inc
 ret 0
-cnv2_double_main_loop_rwz_all_asm ENDP
+cnv2_double_main_loop_rwz_original_all_asm ENDP
+ALIGN 64
+cnv2_main_loop_rwz_upx2_all_asm PROC
+INCLUDE cnv2_main_loop_rwz_upx2_all.inc
+ret 0
+cnv2_main_loop_rwz_upx2_all_asm ENDP
+ALIGN 64
+cnv2_double_main_loop_rwz_upx2_all_asm PROC
+INCLUDE cnv2_double_main_loop_rwz_upx2_all.inc
+ret 0
+cnv2_double_main_loop_rwz_upx2_all_asm ENDP
 ALIGN 64
 cnv1_main_loop_soft_aes_sandybridge_asm PROC

View file

@@ -34,8 +34,11 @@
 .global FN_PREFIX(cnv2_main_loop_zelerius_bulldozer_asm)
 .global FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm)
-.global FN_PREFIX(cnv2_main_loop_rwz_all_asm)
+.global FN_PREFIX(cnv2_main_loop_rwz_original_all_asm)
-.global FN_PREFIX(cnv2_double_main_loop_rwz_all_asm)
+.global FN_PREFIX(cnv2_double_main_loop_rwz_original_all_asm)
+.global FN_PREFIX(cnv2_main_loop_rwz_upx2_all_asm)
+.global FN_PREFIX(cnv2_double_main_loop_rwz_upx2_all_asm)
 .global FN_PREFIX(cnv1_main_loop_soft_aes_sandybridge_asm)
 .global FN_PREFIX(cnv1_main_loop_lite_soft_aes_sandybridge_asm)

@@ -175,13 +178,23 @@ FN_PREFIX(cnv2_double_main_loop_zelerius_sandybridge_asm):
 ret 0
 ALIGN 64
-FN_PREFIX(cnv2_main_loop_rwz_all_asm):
+FN_PREFIX(cnv2_main_loop_rwz_original_all_asm):
-#include "../cnv2_main_loop_rwz_all.inc"
+#include "../cnv2_main_loop_rwz_original_all.inc"
 ret 0
 ALIGN 64
-FN_PREFIX(cnv2_double_main_loop_rwz_all_asm):
+FN_PREFIX(cnv2_double_main_loop_rwz_original_all_asm):
-#include "../cnv2_double_main_loop_rwz_all.inc"
+#include "../cnv2_double_main_loop_rwz_original_all.inc"
+ret 0
+ALIGN 64
+FN_PREFIX(cnv2_main_loop_rwz_upx2_all_asm):
+#include "../cnv2_main_loop_rwz_upx2_all.inc"
+ret 0
+ALIGN 64
+FN_PREFIX(cnv2_double_main_loop_rwz_upx2_all_asm):
+#include "../cnv2_double_main_loop_rwz_upx2_all.inc"
 ret 0
 ALIGN 64

View file

@@ -18,7 +18,7 @@
 mov r10, QWORD PTR [rcx+32]
 mov r8, rcx
 xor r10, QWORD PTR [rcx]
-mov r14d, 393216
+mov r14d, ${ITERATIONS}
 mov r11, QWORD PTR [rcx+40]
 xor r11, QWORD PTR [rcx+8]
 mov rsi, QWORD PTR [rdx+224]

@@ -41,7 +41,7 @@
 movaps XMMWORD PTR [rsp+16], xmm15
 mov rdx, r10
 movd xmm4, QWORD PTR [r8+96]
-and edx, 2097136
+and edx, ${MASK}
 mov rax, QWORD PTR [rcx+48]
 xorps xmm13, xmm13
 xor rax, QWORD PTR [rcx+16]

@@ -83,7 +83,7 @@
 mov rcx, rdi
 mov QWORD PTR [rsp+264], r11
 movd xmm8, rax
-and ecx, 2097136
+and ecx, ${MASK}
 punpcklqdq xmm8, xmm0
 movd xmm0, QWORD PTR [r9+96]
 punpcklqdq xmm4, xmm0

@@ -95,7 +95,7 @@
 movdqu xmm15, XMMWORD PTR [r9]
 ALIGN(64)
-rwz_main_loop_double:
+rwz_main_loop_double_${ALGO}:
 movdqu xmm9, xmm15
 mov eax, edx
 mov ebx, edx

@@ -120,7 +120,7 @@ rwz_main_loop_double:
 movd r11, xmm9
 mov edx, r11d
-and edx, 2097136
+and edx, ${MASK}
 movdqa xmm0, xmm9
 pxor xmm0, xmm7
 movdqu XMMWORD PTR [r9], xmm0

@@ -151,7 +151,7 @@ rwz_main_loop_double:
 movdqu XMMWORD PTR [rax+rsi], xmm0
 movd rcx, xmm10
-and ecx, 2097136
+and ecx, ${MASK}
 movdqa xmm0, xmm10
 pxor xmm0, xmm6

@@ -199,7 +199,7 @@ rwz_main_loop_double:
 mov QWORD PTR [rbx+8], rdx
 xor rdx, r9
 mov QWORD PTR [rsp+256], r11
-and r11d, 2097136
+and r11d, ${MASK}
 mov QWORD PTR [rsp+264], rdx
 mov QWORD PTR [rsp+8], r11
 lea r15, QWORD PTR [r11+r13]

@@ -249,8 +249,8 @@ rwz_main_loop_double:
 mov rbx, rax
 imul rax, rdx
 sub r11, rax
-js rwz_div_fix_1
+js rwz_div_fix_1_${ALGO}
-rwz_div_fix_1_ret:
+rwz_div_fix_1_${ALGO}_ret:
 cvttsd2si rdx, xmm2
 mov rax, rdx

@@ -258,8 +258,8 @@ rwz_div_fix_1_ret:
 movd xmm2, r11d
 movd xmm4, ebx
 sub r8, rax
-js rwz_div_fix_2
+js rwz_div_fix_2_${ALGO}
-rwz_div_fix_2_ret:
+rwz_div_fix_2_${ALGO}_ret:
 movd xmm1, r8d
 movd xmm0, edx

@@ -275,15 +275,15 @@ rwz_div_fix_2_ret:
 movdqa xmm5, xmm1
 psrlq xmm5, 19
 test r9, 524287
-je rwz_sqrt_fix_1
+je rwz_sqrt_fix_1_${ALGO}
-rwz_sqrt_fix_1_ret:
+rwz_sqrt_fix_1_${ALGO}_ret:
 movd r9, xmm10
 psrldq xmm1, 8
 movd r8, xmm1
 test r8, 524287
-je rwz_sqrt_fix_2
+je rwz_sqrt_fix_2_${ALGO}
-rwz_sqrt_fix_2_ret:
+rwz_sqrt_fix_2_${ALGO}_ret:
 mov r12d, ecx
 mov r8d, ecx

@@ -313,7 +313,7 @@ rwz_sqrt_fix_2_ret:
 mov QWORD PTR [r13], rdi
 xor rdi, r10
 mov ecx, edi
-and ecx, 2097136
+and ecx, ${MASK}
 lea r8, QWORD PTR [rcx+rsi]
 mov rdx, QWORD PTR [r13+8]

@@ -331,7 +331,7 @@ rwz_sqrt_fix_2_ret:
 movdqa xmm6, xmm10
 mov r9, r15
 dec r14d
-jne rwz_main_loop_double
+jne rwz_main_loop_double_${ALGO}
 ldmxcsr DWORD PTR [rsp+272]
 movaps xmm13, XMMWORD PTR [rsp+48]

@@ -354,19 +354,19 @@ rwz_sqrt_fix_2_ret:
 pop rsi
 pop rbp
 pop rbx
-jmp rwz_cnv2_double_mainloop_asm_endp
+jmp rwz_cnv2_double_mainloop_${ALGO}_asm_endp
-rwz_div_fix_1:
+rwz_div_fix_1_${ALGO}:
 dec rbx
 add r11, rdx
-jmp rwz_div_fix_1_ret
+jmp rwz_div_fix_1_${ALGO}_ret
-rwz_div_fix_2:
+rwz_div_fix_2_${ALGO}:
 dec rdx
 add r8, r9
-jmp rwz_div_fix_2_ret
+jmp rwz_div_fix_2_${ALGO}_ret
-rwz_sqrt_fix_1:
+rwz_sqrt_fix_1_${ALGO}:
 movd r8, xmm3
 movdqa xmm0, xmm5
 psrldq xmm0, 8

@@ -385,9 +385,9 @@ rwz_sqrt_fix_1:
 adc r9, 0
 movd xmm5, r9
 punpcklqdq xmm5, xmm0
-jmp rwz_sqrt_fix_1_ret
+jmp rwz_sqrt_fix_1_${ALGO}_ret
-rwz_sqrt_fix_2:
+rwz_sqrt_fix_2_${ALGO}:
 psrldq xmm3, 8
 movd r11, xmm3
 dec r8

@@ -405,6 +405,6 @@ rwz_sqrt_fix_2:
 adc r8, 0
 movd xmm0, r8
 punpcklqdq xmm5, xmm0
-jmp rwz_sqrt_fix_2_ret
+jmp rwz_sqrt_fix_2_${ALGO}_ret
-rwz_cnv2_double_mainloop_asm_endp:
+rwz_cnv2_double_mainloop_${ALGO}_asm_endp:

View file

@@ -15,7 +15,7 @@
 mov rax, QWORD PTR [rcx+48]
 mov r9, rcx
 xor rax, QWORD PTR [rcx+16]
-mov esi, 393216
+mov esi, ${ITERATIONS}
 mov r8, QWORD PTR [rcx+32]
 mov r13d, -2147483647
 xor r8, QWORD PTR [rcx]

@@ -35,7 +35,7 @@
 movaps XMMWORD PTR [rsp+64], xmm6
 movaps XMMWORD PTR [rsp+48], xmm7
 movaps XMMWORD PTR [rsp+32], xmm8
-and r10d, 2097136
+and r10d, ${MASK}
 movd xmm5, rax
 xor eax, eax

@@ -51,7 +51,7 @@
 movdqu xmm6, XMMWORD PTR [r10+rbx]
 ALIGN(64)
-rwz_main_loop:
+rwz_main_loop_${ALGO}:
 lea rdx, QWORD PTR [r10+rbx]
 mov ecx, r10d
 mov eax, r10d

@@ -65,7 +65,7 @@ rwz_main_loop:
 aesenc xmm6, xmm7
 movd rbp, xmm6
 mov r9, rbp
-and r9d, 2097136
+and r9d, ${MASK}
 movdqu xmm0, XMMWORD PTR [rcx+rbx]
 movdqu xmm1, XMMWORD PTR [rax+rbx]
 movdqu xmm2, XMMWORD PTR [r10+rbx]

@@ -109,9 +109,9 @@ rwz_main_loop:
 psubq xmm3, XMMWORD PTR [rsp+16]
 movd rdx, xmm3
 test edx, 524287
-je rwz_sqrt_fixup
+je rwz_sqrt_fixup_${ALGO}
 psrlq xmm3, 19
-rwz_sqrt_fixup_ret:
+rwz_sqrt_fixup_${ALGO}_ret:
 mov ecx, r10d
 mov rax, rdi

@@ -122,7 +122,7 @@ rwz_sqrt_fixup_ret:
 mov QWORD PTR [r14], r8
 xor r8, rdi
 mov edi, r8d
-and edi, 2097136
+and edi, ${MASK}
 movd xmm0, rax
 xor rax, [rcx+rbx+8]
 add r11, rax

@@ -147,7 +147,7 @@ rwz_sqrt_fixup_ret:
 mov r10d, edi
 xor r11, r12
 dec rsi
-jne rwz_main_loop
+jne rwz_main_loop_${ALGO}
 ldmxcsr DWORD PTR [rsp]
 mov rbx, QWORD PTR [rsp+160]

@@ -162,9 +162,9 @@ rwz_sqrt_fixup_ret:
 pop rdi
 pop rsi
 pop rbp
-jmp cnv2_rwz_main_loop_endp
+jmp cnv2_rwz_main_loop_${ALGO}_endp
-rwz_sqrt_fixup:
+rwz_sqrt_fixup_${ALGO}:
 dec rdx
 mov r13d, -1022
 shl r13, 32

@@ -181,6 +181,6 @@ rwz_sqrt_fixup:
 sub rcx, r9
 adc rdx, 0
 movd xmm3, rdx
-jmp rwz_sqrt_fixup_ret
+jmp rwz_sqrt_fixup_${ALGO}_ret
-cnv2_rwz_main_loop_endp:
+cnv2_rwz_main_loop_${ALGO}_endp:

View file

@@ -143,6 +143,10 @@ PowVariant Job::powVariant() const
 return PowVariant::POW_TURTLE;
 }
+if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_EXTREMELITE) {
+return PowVariant::POW_UPX2;
+}
 if (m_powVariant == PowVariant::POW_AUTODETECT) {
 if (m_blob[0] >= 10) {
 return PowVariant::POW_V4;

View file

@@ -62,16 +62,20 @@ DonateStrategy::DonateStrategy(const char *agent, IStrategyListener *listener) :
 url = new Url("donate2.graef.in", 1080, userId, nullptr, true, false, true);
 } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
 url = new Url("donate2.graef.in", 8090, userId, nullptr, true, false, true);
+} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_EXTREMELITE) {
+url = new Url("donate2.graef.in", 9091, userId, nullptr, true, false, true);
 } else {
 url = new Url("donate2.graef.in", 443, userId, nullptr, true, false, true);
 }
 #else
 if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_HEAVY) {
-url = new Url("donate.graef.in", 8443, userId, nullptr, false, false, true);
+url = new Url("donate2.graef.in", 9000, userId, nullptr, false, false, true);
 } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_LITE) {
-url = new Url("donate.graef.in", 1080, userId, nullptr, false, false, true);
+url = new Url("donate2.graef.in", 7000, userId, nullptr, false, false, true);
 } else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_ULTRALITE) {
 url = new Url("donate2.graef.in", 8088, userId, nullptr, false, false, true);
+} else if (Options::i()->algo() == Options::ALGO_CRYPTONIGHT_EXTREMELITE) {
+url = new Url("donate2.graef.in", 8088, userId, nullptr, false, false, true);
 } else {
 url = new Url("donate2.graef.in", 80, userId, nullptr, false, false, true);
 }

View file

@@ -36,14 +36,14 @@
 #define APP_DESC "XMRigCC CPU miner"
 #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id"
 #endif
-#define APP_VERSION "1.9.1 (based on XMRig)"
+#define APP_VERSION "1.9.2 (based on XMRig)"
 #define APP_DOMAIN ""
 #define APP_SITE "https://github.com/Bendr0id/xmrigCC"
 #define APP_KIND "cpu"
 #define APP_VER_MAJOR 1
 #define APP_VER_MINOR 9
-#define APP_VER_BUILD 1
+#define APP_VER_BUILD 2
 #define APP_VER_REV 0
 #ifndef NDEBUG