From af6262116977776bedf408f9f22309b32efc252c Mon Sep 17 00:00:00 2001 From: XMRig Date: Tue, 2 Oct 2018 23:58:53 +0300 Subject: [PATCH 01/16] Fix CURL detection. --- CMakeLists.txt | 8 ++++---- README.md | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9bc81bff..e0a23b29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,9 +91,9 @@ endif() set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -maes -Wno-pointer-to-int-cast") if (CMAKE_C_COMPILER_ID MATCHES "Clang") - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -fmerge-all-constants") else() - set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -Ofast -s -funroll-loops -fvariable-expansion-in-unroller -ftree-loop-if-convert-stores -fmerge-all-constants -fbranch-target-load-optimize2") endif() #set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} -gdwarf-2") @@ -140,9 +140,9 @@ endif() if (CMAKE_SIZEOF_VOID_P EQUAL 8) add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig jansson curl ${CPUID_LIB} ${EXTRA_LIBS}) + target_link_libraries(xmrig jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) else() add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig32 jansson curl ${CPUID_LIB} ${EXTRA_LIBS}) + target_link_libraries(xmrig32 jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) endif() diff --git a/README.md b/README.md index fb3f44e2..d4b59ceb 100644 --- a/README.md +++ b/README.md @@ -99,7 +99,7 @@ Configure options for libcurl: ``` CMake options: ``` -cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\\curl-7.53.1\include" -DCURL_LIBRARY="c:\\curl-7.53.1\lib\.libs" +cmake .. -G "Unix Makefiles" -DCMAKE_BUILD_TYPE=Release -DCURL_INCLUDE_DIR="c:\xmrig-deps\gcc\x64\include" -DCURL_LIBRARY="c:\xmrig-deps\gcc\x64\lib\libcurl.a" ``` ### Optional features From 0b4b07fcd6694263847c5bfdafa0eb19fc36247f Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 3 Oct 2018 00:39:45 +0300 Subject: [PATCH 02/16] v0.9.0-dev --- version.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/version.h b/version.h index e98da069..c99cea7e 100644 --- a/version.h +++ b/version.h @@ -21,20 +21,20 @@ * along with this program. If not, see . */ -#ifndef __VERSION_H__ -#define __VERSION_H__ +#ifndef XMRIG_VERSION_H +#define XMRIG_VERSION_H #define APP_ID "xmrig" #define APP_NAME "XMRig" #define APP_DESC "Monero (XMR) CPU miner" -#define APP_VERSION "0.8.3" +#define APP_VERSION "0.9.0-dev" #define APP_DOMAIN "xmrig.com" #define APP_SITE "www.xmrig.com" #define APP_COPYRIGHT "Copyright (C) 2016-2018 xmrig.com" #define APP_VER_MAJOR 0 -#define APP_VER_MINOR 8 -#define APP_VER_BUILD 3 +#define APP_VER_MINOR 9 +#define APP_VER_BUILD 0 #define APP_VER_REV 0 -#endif /* __VERSION_H__ */ +#endif /* XMRIG_VERSION_H */ From b93e7d9daaafeb5171934a6209a4c2c65df6f5c4 Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 3 Oct 2018 00:41:14 +0300 Subject: [PATCH 03/16] Workaround for xmrig-proxy bug. --- stratum.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/stratum.c b/stratum.c index 4e937251..94650ec7 100644 --- a/stratum.c +++ b/stratum.c @@ -625,6 +625,11 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) { memcpy(&sctx->id, id, strlen(id)); const char *s = json_string_value(json_object_get(res, "status")); + if (!s) { + // Workaround for xmrig-proxy bug https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89 + s = json_string_value(json_object_get(val, "status")); + } + if (!s) { applog(LOG_ERR, "JSON invalid status"); return false; From f0b293f650bcd54620d61306e833c9e2d286dbdf Mon Sep 17 00:00:00 2001 From: XMRig Date: Wed, 3 Oct 2018 01:27:45 +0300 Subject: [PATCH 04/16] Add support for "nicehash" protocol extension. --- algo/cryptonight/cryptonight_test.h | 31 +++++++++++++++-------------- stratum.c | 27 +++++++++++++++++++++++-- 2 files changed, 41 insertions(+), 17 deletions(-) diff --git a/algo/cryptonight/cryptonight_test.h b/algo/cryptonight/cryptonight_test.h index 65200f75..c5ef5037 100644 --- a/algo/cryptonight/cryptonight_test.h +++ b/algo/cryptonight/cryptonight_test.h @@ -6,6 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify @@ -22,49 +23,49 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_TEST_H__ -#define __CRYPTONIGHT_TEST_H__ +#ifndef XMRIG_CRYPTONIGHT_TEST_H +#define XMRIG_CRYPTONIGHT_TEST_H const static uint8_t test_input[152] = { - 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, - 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, - 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, - 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, - 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02, 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, - 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01 + 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, + 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, + 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, + 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, + 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, + 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02 }; const static uint8_t test_output0[64] = { + 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, + 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, - 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, - 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00 }; #ifndef XMRIG_NO_AEON const static uint8_t test_output1[64] = { - 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, - 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, + 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, }; #endif // Monero v7 const static uint8_t test_output2[64] = { + 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, + 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, - 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, - 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9 }; -#endif /* __CRYPTONIGHT_TEST_H__ */ +#endif /* XMRIG_CRYPTONIGHT_TEST_H */ diff --git a/stratum.c b/stratum.c index 94650ec7..e1090476 100644 --- a/stratum.c +++ b/stratum.c @@ -40,11 +40,12 @@ # include #endif -#include "stratum.h" -#include "version.h" +#include "options.h" #include "stats.h" +#include "stratum.h" #include "util.h" #include "utils/applog.h" +#include "version.h" #ifdef WIN32 @@ -73,6 +74,7 @@ static int sockopt_keepalive_cb(void *userdata, curl_socket_t fd, curlsocktype p static curl_socket_t opensocket_grab_cb(void *clientp, curlsocktype purpose, struct curl_sockaddr *addr); static int closesocket_cb(void *clientp, curl_socket_t item); static bool login_decode(struct stratum_ctx *sctx, const json_t *val); +static void extensions_decode(const json_t *val); static bool job_decode(const json_t *job); static bool jobj_binary(const json_t *obj, const char *key, void *buf, size_t buflen); @@ -640,10 +642,31 @@ static bool login_decode(struct stratum_ctx *sctx, const json_t *val) { return false; } + extensions_decode(res); + return true; } +static void extensions_decode(const json_t *res) +{ + json_t *extensions = json_object_get(res, "extensions"); + if (!extensions || json_array_size(extensions) == 0) { + return; + } + + size_t index; + json_t *value; + + json_array_foreach(extensions, index, value) { + const char *s = json_string_value(value); + if (s && strcmp(s, "nicehash")) { + opt_nicehash = true; + } + } +} + + /** * @brief job_decode * @param sctx From 93d072ff6ebbc7d39d3c81bb6cef3c9756ac4291 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 15:52:12 +0300 Subject: [PATCH 05/16] Massive refactoring, preparing for cn/2. --- CMakeLists.txt | 16 +- .../cryptonight-lite/cryptonight_lite_aesni.h | 24 +- ...ite_av1_aesni.c => cryptonight_lite_av1.c} | 72 +++++- ..._aesni_double.c => cryptonight_lite_av2.c} | 108 ++++++++- algo/cryptonight-lite/cryptonight_lite_av3.c | 134 +++++++++++ .../cryptonight_lite_av3_softaes.c | 78 ------ ...oftaes_double.c => cryptonight_lite_av4.c} | 108 ++++++++- .../cryptonight_lite_softaes.h | 19 ++ algo/cryptonight/cryptonight.c | 224 +++++++++++------- algo/cryptonight/cryptonight.h | 28 ++- algo/cryptonight/cryptonight_aesni.h | 24 +- ...ptonight_av1_aesni.c => cryptonight_av1.c} | 80 ++++++- ...t_av2_aesni_double.c => cryptonight_av2.c} | 125 ++++++++-- algo/cryptonight/cryptonight_av3.c | 139 +++++++++++ algo/cryptonight/cryptonight_av3_softaes.c | 84 ------- ...av4_softaes_double.c => cryptonight_av4.c} | 124 ++++++++-- algo/cryptonight/cryptonight_monero.h | 114 +++++++-- algo/cryptonight/cryptonight_softaes.h | 25 +- algo/cryptonight/cryptonight_test.h | 45 +++- memory.c | 43 +--- options.c | 35 ++- options.h | 37 ++- persistent_memory.h | 12 +- stratum.h | 7 +- utils/summary.c | 4 +- xmrig.c | 8 +- 26 files changed, 1259 insertions(+), 458 deletions(-) rename algo/cryptonight-lite/{cryptonight_lite_av1_aesni.c => cryptonight_lite_av1.c} (53%) rename algo/cryptonight-lite/{cryptonight_lite_av2_aesni_double.c => cryptonight_lite_av2.c} (50%) create mode 100644 algo/cryptonight-lite/cryptonight_lite_av3.c delete mode 100644 algo/cryptonight-lite/cryptonight_lite_av3_softaes.c rename algo/cryptonight-lite/{cryptonight_lite_av4_softaes_double.c => cryptonight_lite_av4.c} (51%) rename algo/cryptonight/{cryptonight_av1_aesni.c => cryptonight_av1.c} (52%) rename algo/cryptonight/{cryptonight_av2_aesni_double.c => cryptonight_av2.c} (51%) create mode 100644 algo/cryptonight/cryptonight_av3.c delete mode 100644 algo/cryptonight/cryptonight_av3_softaes.c rename algo/cryptonight/{cryptonight_av4_softaes_double.c => cryptonight_av4.c} (51%) diff --git a/CMakeLists.txt b/CMakeLists.txt index e0a23b29..5fa22d7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,10 +43,10 @@ set(HEADERS_UTILS set(SOURCES xmrig.c algo/cryptonight/cryptonight.c - algo/cryptonight/cryptonight_av1_aesni.c - algo/cryptonight/cryptonight_av2_aesni_double.c - algo/cryptonight/cryptonight_av3_softaes.c - algo/cryptonight/cryptonight_av4_softaes_double.c + algo/cryptonight/cryptonight_av1.c + algo/cryptonight/cryptonight_av2.c + algo/cryptonight/cryptonight_av3.c + algo/cryptonight/cryptonight_av4.c util.c options.c stratum.c @@ -127,10 +127,10 @@ endif() if (WITH_AEON) set(SOURCES_AEON - algo/cryptonight-lite/cryptonight_lite_av1_aesni.c - algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c - algo/cryptonight-lite/cryptonight_lite_av3_softaes.c - algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c + algo/cryptonight-lite/cryptonight_lite_av1.c + algo/cryptonight-lite/cryptonight_lite_av2.c + algo/cryptonight-lite/cryptonight_lite_av3.c + algo/cryptonight-lite/cryptonight_lite_av4.c algo/cryptonight-lite/cryptonight_lite_aesni.h algo/cryptonight-lite/cryptonight_lite_softaes.h ) diff --git a/algo/cryptonight-lite/cryptonight_lite_aesni.h b/algo/cryptonight-lite/cryptonight_lite_aesni.h index bb528cfb..0ac6a135 100644 --- a/algo/cryptonight-lite/cryptonight_lite_aesni.h +++ b/algo/cryptonight-lite/cryptonight_lite_aesni.h @@ -22,10 +22,12 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_LITE_AESNI_H__ -#define __CRYPTONIGHT_LITE_AESNI_H__ +#ifndef XMRIG_CRYPTONIGHT_LITE_AESNI_H +#define XMRIG_CRYPTONIGHT_LITE_AESNI_H + #include +#include #define aes_genkey_sub(imm8) \ @@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint #endif -#endif /* __CRYPTONIGHT_LITE_AESNI_H__ */ +static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = EXTRACT64(tmp); + + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + uint64_t vh = EXTRACT64(tmp); + + uint8_t x = vh >> 24; + static const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + + mem_out[1] = vh; +} + + +#endif /* XMRIG_CRYPTONIGHT_LITE_AESNI_H */ diff --git a/algo/cryptonight-lite/cryptonight_lite_av1_aesni.c b/algo/cryptonight-lite/cryptonight_lite_av1.c similarity index 53% rename from algo/cryptonight-lite/cryptonight_lite_av1_aesni.c rename to algo/cryptonight-lite/cryptonight_lite_av1.c index fb678746..307c256d 100644 --- a/algo/cryptonight-lite/cryptonight_lite_av1_aesni.c +++ b/algo/cryptonight-lite/cryptonight_lite_av1.c @@ -27,18 +27,19 @@ #include #include "algo/cryptonight/cryptonight.h" -#include "cryptonight_lite_aesni.h" +#include "algo/cryptonight/cryptonight_monero.h" #include "crypto/c_keccak.h" +#include "cryptonight_lite_aesni.h" -void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_lite_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); + keccak(input, size, ctx[0]->state, 200); - cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); - const uint8_t* l0 = ctx->memory; - uint64_t* h0 = (uint64_t*) ctx->state0; + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; uint64_t al0 = h0[0] ^ h0[4]; uint64_t ah0 = h0[1] ^ h0[5]; @@ -71,8 +72,63 @@ void cryptonight_lite_av1_aesni(const void *restrict input, size_t size, void *r idx0 = al0; } - cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); keccakf(h0, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_lite_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state, 200); + + VARIANT1_INIT(0); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } diff --git a/algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c b/algo/cryptonight-lite/cryptonight_lite_av2.c similarity index 50% rename from algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c rename to algo/cryptonight-lite/cryptonight_lite_av2.c index 727e804b..31b85d8d 100644 --- a/algo/cryptonight-lite/cryptonight_lite_av2_aesni_double.c +++ b/algo/cryptonight-lite/cryptonight_lite_av2.c @@ -27,19 +27,20 @@ #include #include "algo/cryptonight/cryptonight.h" +#include "algo/cryptonight/cryptonight_monero.h" #include "cryptonight_lite_aesni.h" #include "crypto/c_keccak.h" -void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_lite_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); - keccak((const uint8_t *) input + size, size, ctx->state1, 200); + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEMORY_LITE; - uint64_t* h0 = (uint64_t*) ctx->state0; - uint64_t* h1 = (uint64_t*) ctx->state1; + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -107,6 +108,95 @@ void cryptonight_lite_av2_aesni_double(const void *restrict input, size_t size, keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); - extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); +} + + +void cryptonight_lite_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; + ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; + lo = _umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; + ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); } diff --git a/algo/cryptonight-lite/cryptonight_lite_av3.c b/algo/cryptonight-lite/cryptonight_lite_av3.c new file mode 100644 index 00000000..b0d5d368 --- /dev/null +++ b/algo/cryptonight-lite/cryptonight_lite_av3.c @@ -0,0 +1,134 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2016-2018 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "algo/cryptonight/cryptonight.h" +#include "algo/cryptonight/cryptonight_monero.h" +#include "cryptonight_lite_softaes.h" +#include "crypto/c_keccak.h" + + +void cryptonight_lite_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + + _mm_store_si128((__m128i *) &l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_lite_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state, 200); + + VARIANT1_INIT(0); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} diff --git a/algo/cryptonight-lite/cryptonight_lite_av3_softaes.c b/algo/cryptonight-lite/cryptonight_lite_av3_softaes.c deleted file mode 100644 index a5a36fbb..00000000 --- a/algo/cryptonight-lite/cryptonight_lite_av3_softaes.c +++ /dev/null @@ -1,78 +0,0 @@ -/* XMRig - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * Copyright 2014 Lucas Jones - * Copyright 2014-2016 Wolf9466 - * Copyright 2016 Jay D Dee - * Copyright 2017 fireice-uk - * Copyright 2017-2018 XMR-Stak , - * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include - -#include "algo/cryptonight/cryptonight.h" -#include "cryptonight_lite_softaes.h" -#include "crypto/c_keccak.h" - - -void cryptonight_lite_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) -{ - keccak((const uint8_t *) input, size, ctx->state0, 200); - - cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); - - const uint8_t* l0 = ctx->memory; - uint64_t* h0 = (uint64_t*) ctx->state0; - - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t ah0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - - for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & 0xFFFF0]); - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - - _mm_store_si128((__m128i *)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx)); - idx0 = EXTRACT64(cx); - bx0 = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & 0xFFFF0])[0]; - ch = ((uint64_t*)&l0[idx0 & 0xFFFF0])[1]; - lo = _umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ((uint64_t*)&l0[idx0 & 0xFFFF0])[0] = al0; - ((uint64_t*)&l0[idx0 & 0xFFFF0])[1] = ah0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - } - - cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); - - keccakf(h0, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); -} diff --git a/algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c b/algo/cryptonight-lite/cryptonight_lite_av4.c similarity index 51% rename from algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c rename to algo/cryptonight-lite/cryptonight_lite_av4.c index cdf8ff5d..4a386642 100644 --- a/algo/cryptonight-lite/cryptonight_lite_av4_softaes_double.c +++ b/algo/cryptonight-lite/cryptonight_lite_av4.c @@ -27,19 +27,20 @@ #include #include "algo/cryptonight/cryptonight.h" +#include "algo/cryptonight/cryptonight_monero.h" #include "cryptonight_lite_softaes.h" #include "crypto/c_keccak.h" -void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_lite_av4_v0(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); - keccak((const uint8_t *) input + size, size, ctx->state1, 200); + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEMORY_LITE; - uint64_t* h0 = (uint64_t*) ctx->state0; - uint64_t* h1 = (uint64_t*) ctx->state1; + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -107,6 +108,95 @@ void cryptonight_lite_av4_softaes_double(const void *restrict input, size_t size keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); - extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_lite_av4_v1(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + for (size_t i = 0; __builtin_expect(i < 0x40000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0xFFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0xFFFF0]); + + cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0xFFFF0], _mm_xor_si128(bx0, cx0)); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0xFFFF0], _mm_xor_si128(bx1, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0xFFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & 0xFFFF0])[0] = al0; + ((uint64_t*) &l0[idx0 & 0xFFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0xFFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0xFFFF0])[1]; + lo = _umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & 0xFFFF0])[0] = al1; + ((uint64_t*) &l1[idx1 & 0xFFFF0])[1] = ah1 ^ tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, (char*) output + 32); } diff --git a/algo/cryptonight-lite/cryptonight_lite_softaes.h b/algo/cryptonight-lite/cryptonight_lite_softaes.h index bab3dcaf..1e06a0f2 100644 --- a/algo/cryptonight-lite/cryptonight_lite_softaes.h +++ b/algo/cryptonight-lite/cryptonight_lite_softaes.h @@ -25,7 +25,10 @@ #ifndef __CRYPTONIGHT_LITE_SOFTAES_H__ #define __CRYPTONIGHT_LITE_SOFTAES_H__ + #include +#include + extern __m128i soft_aesenc(__m128i in, __m128i key); extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); @@ -234,4 +237,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint #endif +static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = EXTRACT64(tmp); + + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + uint64_t vh = EXTRACT64(tmp); + + uint8_t x = vh >> 24; + static const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + + mem_out[1] = vh; +} + + #endif /* __CRYPTONIGHT_LITE_SOFTAES_H__ */ diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index fb981df2..cd91f9a8 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -23,10 +23,12 @@ */ +#include #include #include #include + #ifndef BUILD_TEST # include "xmrig.h" #endif @@ -39,113 +41,136 @@ #include "cryptonight_test.h" #include "options.h" +#include "utils/applog.h" + + +void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av1_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av2_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av3_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_av4_v2(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); -void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version); -void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version); -void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version); -void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version); #ifndef XMRIG_NO_AEON -void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t); -void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t); -void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t); -void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t); +void cryptonight_lite_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av2_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av2_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av3_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av3_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av4_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); #endif void (*cryptonight_hash_ctx)(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version) = NULL; -static bool self_test() { - if (cryptonight_hash_ctx == NULL) { +static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) +{ + cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant); + if (func == NULL) { return false; } - char output[64]; + func(test_input, 76, output, ctx); - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16); - ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 2, 16); - - cryptonight_hash_ctx(test_input, 76, output, ctx, 0); - -# ifndef XMRIG_NO_AEON - bool rc = memcmp(output, opt_algo == ALGO_CRYPTONIGHT_LITE ? test_output1 : test_output0, (opt_double_hash ? 64 : 32)) == 0; -# else - bool rc = memcmp(output, test_output0, opt_double_hash ? 64 : 32)) == 0; -# endif - - if (rc && opt_algo == ALGO_CRYPTONIGHT) { - cryptonight_hash_ctx(test_input, 76, output, ctx, 7); - - rc = memcmp(output, test_output2, (opt_double_hash ? 64 : 32)) == 0; - } - - _mm_free(ctx->memory); - _mm_free(ctx); - - return rc; + return memcmp(output, referenceValue, opt_double_hash ? 64 : 32) == 0; } -#ifndef XMRIG_NO_AEON -bool cryptonight_lite_init(int variant) { - switch (variant) { - case AEON_AV1_AESNI: - cryptonight_hash_ctx = cryptonight_lite_av1_aesni; - break; +static bool self_test() { + struct cryptonight_ctx *ctx[2]; + uint8_t output[64]; - case AEON_AV2_AESNI_DOUBLE: - opt_double_hash = true; - cryptonight_hash_ctx = cryptonight_lite_av2_aesni_double; - break; + const size_t count = opt_double_hash ? 2 : 1; + const size_t size = opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE; + bool result = false; - case AEON_AV3_SOFT_AES: - cryptonight_hash_ctx = cryptonight_lite_av3_softaes; - break; - - case AEON_AV4_SOFT_AES_DOUBLE: - opt_double_hash = true; - cryptonight_hash_ctx = cryptonight_lite_av4_softaes_double; - break; - - default: - break; + for (int i = 0; i < count; ++i) { + ctx[i] = _mm_malloc(sizeof(struct cryptonight_ctx), 16); + ctx[i]->memory = _mm_malloc(size, 16); } - return self_test(); + if (opt_algo == ALGO_CRYPTONIGHT) { + result = verify(VARIANT_0, output, ctx, test_output_v0) && + verify(VARIANT_1, output, ctx, test_output_v1) && + verify(VARIANT_0, output, ctx, test_output_v0); + } + else { + result = verify(VARIANT_0, output, ctx, test_output_v0_lite) && + verify(VARIANT_1, output, ctx, test_output_v1_lite); + } + + + for (int i = 0; i < count; ++i) { + _mm_free(ctx[i]->memory); + _mm_free(ctx[i]); + } + + return result; } -#endif -bool cryptonight_init(int variant) +cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant) { -# ifndef XMRIG_NO_AEON - if (opt_algo == ALGO_CRYPTONIGHT_LITE) { - return cryptonight_lite_init(variant); - } + assert(av > AV_AUTO && av < AV_MAX); + assert(variant > VARIANT_AUTO && variant < VARIANT_MAX); + + static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { + cryptonight_av1_v0, + cryptonight_av2_v0, + cryptonight_av3_v0, + cryptonight_av4_v0, + cryptonight_av1_v1, + cryptonight_av2_v1, + cryptonight_av3_v1, + cryptonight_av4_v1, + cryptonight_av1_v2, + cryptonight_av2_v2, + cryptonight_av3_v2, + cryptonight_av4_v2, + +# ifndef XMRIG_NO_AEON + cryptonight_lite_av1_v0, + cryptonight_lite_av2_v0, + cryptonight_lite_av3_v0, + cryptonight_lite_av4_v0, + cryptonight_lite_av1_v1, + cryptonight_lite_av2_v1, + cryptonight_lite_av3_v1, + cryptonight_lite_av4_v1, + NULL, + NULL, + NULL, + NULL +# endif + }; + + const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; + +# ifndef NDEBUG + cn_hash_fun func = func_table[index]; + + assert(index < sizeof(func_table) / sizeof(func_table[0])); + assert(func != NULL); + + return func; +# else + return func_table[index]; # endif +} - switch (variant) { - case XMR_AV1_AESNI: - cryptonight_hash_ctx = cryptonight_av1_aesni; - break; - case XMR_AV2_AESNI_DOUBLE: - opt_double_hash = true; - cryptonight_hash_ctx = cryptonight_av2_aesni_double; - break; - - case XMR_AV3_SOFT_AES: - cryptonight_hash_ctx = cryptonight_av3_softaes; - break; - - case XMR_AV4_SOFT_AES_DOUBLE: - opt_double_hash = true; - cryptonight_hash_ctx = cryptonight_av4_softaes_double; - break; - - default: - break; - } +bool cryptonight_init(int av) +{ + opt_double_hash = av == AV_DOUBLE || av == AV_DOUBLE_SOFT; return self_test(); } @@ -174,12 +199,32 @@ static inline void do_skein_hash(const void* input, size_t len, char* output) { void (* const extra_hashes[4])(const void *, size_t, char *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +static inline enum Variant cryptonight_variant(uint8_t version) +{ + if (opt_variant != VARIANT_AUTO) { + return opt_variant; + } + + if (opt_algo == ALGO_CRYPTONIGHT_LITE) { + return VARIANT_1; + } + + if (version >= 8) { + return VARIANT_2; + } + + return version == 7 ? VARIANT_1 : VARIANT_0; +} + + #ifndef BUILD_TEST -int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) { - uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39); +int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { + uint32_t *nonceptr = (uint32_t*) (((char*) blob) + 39); + enum Variant variant = cryptonight_variant(blob[0]); do { - cryptonight_hash_ctx(blob, blob_size, hash, ctx, ((uint8_t*) blob)[0]); + cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx); + (*hashes_done)++; if (unlikely(hash[7] < target)) { @@ -193,13 +238,14 @@ int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, si } -int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx) { - int rc = 0; - uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39); - uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size); +int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx) { + int rc = 0; + uint32_t *nonceptr0 = (uint32_t*) (((char*) blob) + 39); + uint32_t *nonceptr1 = (uint32_t*) (((char*) blob) + 39 + blob_size); + enum Variant variant = cryptonight_variant(blob[0]); do { - cryptonight_hash_ctx(blob, blob_size, hash, ctx, ((uint8_t*) blob)[0]); + cryptonight_hash_fn(opt_algo, opt_av, variant)(blob, blob_size, (uint8_t *) hash, ctx); (*hashes_done) += 2; if (unlikely(hash[7] < target)) { diff --git a/algo/cryptonight/cryptonight.h b/algo/cryptonight/cryptonight.h index f8002afe..74646ef5 100644 --- a/algo/cryptonight/cryptonight.h +++ b/algo/cryptonight/cryptonight.h @@ -22,27 +22,37 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_H__ -#define __CRYPTONIGHT_H__ +#ifndef XMRIG_CRYPTONIGHT_H +#define XMRIG_CRYPTONIGHT_H + #include #include #include + +#include "options.h" + + #define MEMORY 2097152 /* 2 MiB */ #define MEMORY_LITE 1048576 /* 1 MiB */ + struct cryptonight_ctx { - uint8_t state0[200] __attribute__((aligned(16))); - uint8_t state1[200] __attribute__((aligned(16))); - uint8_t* memory __attribute__((aligned(16))); + uint8_t state[224] __attribute__((aligned(16))); + uint8_t* memory __attribute__((aligned(16))); }; +typedef void (*cn_hash_fun)(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); + + extern void (* const extra_hashes[4])(const void *, size_t, char *); -bool cryptonight_init(int variant); -int scanhash_cryptonight(int thr_id, uint32_t *hash, uint32_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx); -int scanhash_cryptonight_double(int thr_id, uint32_t *hash, uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx *restrict ctx); +cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant); -#endif /* __CRYPTONIGHT_H__ */ +bool cryptonight_init(int av); +int scanhash_cryptonight(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx); +int scanhash_cryptonight_double(int thr_id, uint32_t *hash, const uint8_t *restrict blob, size_t blob_size, uint32_t target, uint32_t max_nonce, unsigned long *restrict hashes_done, struct cryptonight_ctx **restrict ctx); + +#endif /* XMRIG_CRYPTONIGHT_H */ diff --git a/algo/cryptonight/cryptonight_aesni.h b/algo/cryptonight/cryptonight_aesni.h index e4d6d42f..b6042897 100644 --- a/algo/cryptonight/cryptonight_aesni.h +++ b/algo/cryptonight/cryptonight_aesni.h @@ -22,10 +22,12 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_AESNI_H__ -#define __CRYPTONIGHT_AESNI_H__ +#ifndef XMRIG_CRYPTONIGHT_AESNI_H +#define XMRIG_CRYPTONIGHT_AESNI_H + #include +#include #define aes_genkey_sub(imm8) \ @@ -253,4 +255,20 @@ static inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint #endif -#endif /* __CRYPTONIGHT_AESNI_H__ */ +static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = EXTRACT64(tmp); + + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + uint64_t vh = EXTRACT64(tmp); + + uint8_t x = vh >> 24; + static const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + + mem_out[1] = vh; +} + + +#endif /* XMRIG_CRYPTONIGHT_AESNI_H */ diff --git a/algo/cryptonight/cryptonight_av1_aesni.c b/algo/cryptonight/cryptonight_av1.c similarity index 52% rename from algo/cryptonight/cryptonight_av1_aesni.c rename to algo/cryptonight/cryptonight_av1.c index b2c45c70..4028dd5d 100644 --- a/algo/cryptonight/cryptonight_av1_aesni.c +++ b/algo/cryptonight/cryptonight_av1.c @@ -32,16 +32,14 @@ #include "cryptonight_monero.h" -void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_av1_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); + keccak(input, size, ctx[0]->state, 200); - VARIANT1_INIT(0); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); - cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); - - const uint8_t* l0 = ctx->memory; - uint64_t* h0 = (uint64_t*) ctx->state0; + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; uint64_t al0 = h0[0] ^ h0[4]; uint64_t ah0 = h0[1] ^ h0[5]; @@ -55,7 +53,6 @@ void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restri cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); - VARIANT1_1(&l0[idx0 & 0x1FFFF0]); idx0 = EXTRACT64(cx); bx0 = cx; @@ -67,18 +64,77 @@ void cryptonight_av1_aesni(const void *restrict input, size_t size, void *restri al0 += hi; ah0 += lo; - VARIANT1_2(ah0, 0); ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; - VARIANT1_2(ah0, 0); ah0 ^= ch; al0 ^= cl; idx0 = al0; } - cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); keccakf(h0, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state, 200); + + VARIANT1_INIT(0); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah0, al0)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + } diff --git a/algo/cryptonight/cryptonight_av2_aesni_double.c b/algo/cryptonight/cryptonight_av2.c similarity index 51% rename from algo/cryptonight/cryptonight_av2_aesni_double.c rename to algo/cryptonight/cryptonight_av2.c index 345207f7..7e5f4109 100644 --- a/algo/cryptonight/cryptonight_av2_aesni_double.c +++ b/algo/cryptonight/cryptonight_av2.c @@ -32,18 +32,15 @@ #include "cryptonight_monero.h" -void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_av2_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); - keccak((const uint8_t *) input + size, size, ctx->state1, 200); + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); - VARIANT1_INIT(0); - VARIANT1_INIT(1); - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEMORY; - uint64_t* h0 = (uint64_t*) ctx->state0; - uint64_t* h1 = (uint64_t*) ctx->state1; + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -69,8 +66,94 @@ void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); - VARIANT1_1(&l0[idx0 & 0x1FFFF0]); - VARIANT1_1(&l1[idx1 & 0x1FFFF0]); + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + lo = _umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -86,10 +169,8 @@ void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void al0 += hi; ah0 += lo; - VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; - ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; - VARIANT1_2(ah0, 0); + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; ah0 ^= ch; al0 ^= cl; @@ -102,10 +183,8 @@ void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void al1 += hi; ah1 += lo; - VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; - ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; - VARIANT1_2(ah1, 1); + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; ah1 ^= ch; al1 ^= cl; @@ -118,6 +197,12 @@ void cryptonight_av2_aesni_double(const void *restrict input, size_t size, void keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); - extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + } diff --git a/algo/cryptonight/cryptonight_av3.c b/algo/cryptonight/cryptonight_av3.c new file mode 100644 index 00000000..a70197ce --- /dev/null +++ b/algo/cryptonight/cryptonight_av3.c @@ -0,0 +1,139 @@ +/* XMRig + * Copyright 2010 Jeff Garzik + * Copyright 2012-2014 pooler + * Copyright 2014 Lucas Jones + * Copyright 2014-2016 Wolf9466 + * Copyright 2016 Jay D Dee + * Copyright 2017 fireice-uk + * Copyright 2017-2018 XMR-Stak , + * Copyright 2018 Lee Clagett + * Copyright 2016-2018 XMRig , + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include + +#include "crypto/c_keccak.h" +#include "cryptonight.h" +#include "cryptonight_monero.h" +#include "cryptonight_softaes.h" + + +void cryptonight_av3_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 32); + return; + } + + keccak(input, size, ctx[0]->state, 200); + + VARIANT1_INIT(0); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx; + cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = EXTRACT64(cx); + bx0 = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ +} diff --git a/algo/cryptonight/cryptonight_av3_softaes.c b/algo/cryptonight/cryptonight_av3_softaes.c deleted file mode 100644 index 1d9f654a..00000000 --- a/algo/cryptonight/cryptonight_av3_softaes.c +++ /dev/null @@ -1,84 +0,0 @@ -/* XMRig - * Copyright 2010 Jeff Garzik - * Copyright 2012-2014 pooler - * Copyright 2014 Lucas Jones - * Copyright 2014-2016 Wolf9466 - * Copyright 2016 Jay D Dee - * Copyright 2017 fireice-uk - * Copyright 2017-2018 XMR-Stak , - * Copyright 2018 Lee Clagett - * Copyright 2016-2018 XMRig , - * - * This program is free software: you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see . - */ - -#include -#include - -#include "crypto/c_keccak.h" -#include "cryptonight.h" -#include "cryptonight_monero.h" -#include "cryptonight_softaes.h" - - -void cryptonight_av3_softaes(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) -{ - keccak((const uint8_t *) input, size, ctx->state0, 200); - - VARIANT1_INIT(0); - - cn_explode_scratchpad((__m128i*) ctx->state0, (__m128i*) ctx->memory); - - const uint8_t* l0 = ctx->memory; - uint64_t* h0 = (uint64_t*) ctx->state0; - - uint64_t al0 = h0[0] ^ h0[4]; - uint64_t ah0 = h0[1] ^ h0[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - - uint64_t idx0 = h0[0] ^ h0[4]; - - for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { - __m128i cx; - cx = _mm_load_si128((__m128i *)&l0[idx0 & 0x1FFFF0]); - cx = soft_aesenc(cx, _mm_set_epi64x(ah0, al0)); - - _mm_store_si128((__m128i *)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); - VARIANT1_1(&l0[idx0 & 0x1FFFF0]); - idx0 = EXTRACT64(cx); - bx0 = cx; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0]; - ch = ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1]; - lo = _umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - VARIANT1_2(ah0, 0); - ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; - ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; - VARIANT1_2(ah0, 0); - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - } - - cn_implode_scratchpad((__m128i*) ctx->memory, (__m128i*) ctx->state0); - - keccakf(h0, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); -} diff --git a/algo/cryptonight/cryptonight_av4_softaes_double.c b/algo/cryptonight/cryptonight_av4.c similarity index 51% rename from algo/cryptonight/cryptonight_av4_softaes_double.c rename to algo/cryptonight/cryptonight_av4.c index 4085429d..bb484095 100644 --- a/algo/cryptonight/cryptonight_av4_softaes_double.c +++ b/algo/cryptonight/cryptonight_av4.c @@ -32,18 +32,15 @@ #include "cryptonight_softaes.h" -void cryptonight_av4_softaes_double(const void *restrict input, size_t size, void *restrict output, struct cryptonight_ctx *restrict ctx, uint8_t version) +void cryptonight_av4_v0(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { - keccak((const uint8_t *) input, size, ctx->state0, 200); - keccak((const uint8_t *) input + size, size, ctx->state1, 200); + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); - VARIANT1_INIT(0); - VARIANT1_INIT(1); - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEMORY; - uint64_t* h0 = (uint64_t*) ctx->state0; - uint64_t* h1 = (uint64_t*) ctx->state1; + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -69,8 +66,94 @@ void cryptonight_av4_softaes_double(const void *restrict input, size_t size, voi _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); - VARIANT1_1(&l0[idx0 & 0x1FFFF0]); - VARIANT1_1(&l1[idx1 & 0x1FFFF0]); + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + lo = _umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + lo = _umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + if (size < 43) { + memset(output, 0, 64); + return; + } + + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + VARIANT1_INIT(0); + VARIANT1_INIT(1); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^ h0[4]; + uint64_t idx1 = h1[0] ^ h1[4]; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + cx0 = soft_aesenc(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc(cx1, _mm_set_epi64x(ah1, al1)); + + cryptonight_monero_tweak((uint64_t*)&l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx0)); + cryptonight_monero_tweak((uint64_t*)&l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx1, cx1)); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -86,10 +169,8 @@ void cryptonight_av4_softaes_double(const void *restrict input, size_t size, voi al0 += hi; ah0 += lo; - VARIANT1_2(ah0, 0); ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0] = al0; - ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0; - VARIANT1_2(ah0, 0); + ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1] = ah0 ^ tweak1_2_0; ah0 ^= ch; al0 ^= cl; @@ -102,10 +183,8 @@ void cryptonight_av4_softaes_double(const void *restrict input, size_t size, voi al1 += hi; ah1 += lo; - VARIANT1_2(ah1, 1); ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0] = al1; - ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1; - VARIANT1_2(ah1, 1); + ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1] = ah1 ^ tweak1_2_1; ah1 ^= ch; al1 ^= cl; @@ -118,6 +197,11 @@ void cryptonight_av4_softaes_double(const void *restrict input, size_t size, voi keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state0[0] & 3](ctx->state0, 200, output); - extra_hashes[ctx->state1[0] & 3](ctx->state1, 200, (char*) output + 32); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} + + +void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx *restrict ctx) +{ } diff --git a/algo/cryptonight/cryptonight_monero.h b/algo/cryptonight/cryptonight_monero.h index 2a4e7ee1..44ac27b0 100644 --- a/algo/cryptonight/cryptonight_monero.h +++ b/algo/cryptonight/cryptonight_monero.h @@ -6,6 +6,7 @@ * Copyright 2016 Jay D Dee * Copyright 2017-2018 XMR-Stak , * Copyright 2018 Lee Clagett + * Copyright 2018 SChernykh * Copyright 2016-2018 XMRig , * * This program is free software: you can redistribute it and/or modify @@ -22,30 +23,103 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_MONERO_H__ -#define __CRYPTONIGHT_MONERO_H__ +#ifndef XMRIG_CRYPTONIGHT_MONERO_H +#define XMRIG_CRYPTONIGHT_MONERO_H + + +#include +#include -// VARIANT ALTERATIONS #define VARIANT1_INIT(part) \ - uint64_t tweak1_2_##part = 0; \ - if (version > 6) { \ - tweak1_2_##part = (*(const uint64_t*)(((const uint8_t*) input) + 35 + part * size) ^ \ - *((const uint64_t*)(ctx->state##part) + 24)); \ - } + uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \ + *((const uint64_t*)(ctx[part]->state) + 24)); \ -#define VARIANT1_1(p) \ - if (version > 6) { \ - const uint8_t tmp = ((const uint8_t*)(p))[11]; \ - static const uint32_t table = 0x75310; \ - const uint8_t index = (((tmp >> 3) & 6) | (tmp & 1)) << 1; \ - ((uint8_t*)(p))[11] = tmp ^ ((table >> index) & 0x30); \ - } +#ifndef XMRIG_ARM +# define VARIANT2_INIT(part) \ + __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \ + __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]); -#define VARIANT1_2(p, part) \ - if (version > 6) { \ - (p) ^= tweak1_2_##part; \ - } +#ifdef _MSC_VER +# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { _control87(RC_DOWN, MCW_RC); } +#else +# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { fesetround(FE_DOWNWARD); } +#endif +# define VARIANT2_INTEGER_MATH(part, cl, cx) \ + do { \ + const uint64_t sqrt_result = static_cast(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \ + const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \ + const uint32_t d = static_cast(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \ + const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ + division_result_xmm_##part = _mm_cvtsi64_si128(static_cast(division_result)); \ + sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \ + } while (0) -#endif /* __CRYPTONIGHT_MONERO_H__ */ +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ + do { \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + } while (0) + +# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ + do { \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ + hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ + _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ + } while (0) + +#else +# define VARIANT2_INIT(part) \ + uint64_t division_result_##part = h##part[12]; \ + uint64_t sqrt_result_##part = h##part[13]; + +# define VARIANT2_INTEGER_MATH(part, cl, cx) \ + do { \ + const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ + cl ^= division_result_##part ^ (sqrt_result_##part << 32); \ + const uint32_t d = static_cast(cx_0 + (sqrt_result_##part << 1)) | 0x80000001UL; \ + const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + division_result_##part = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ + const uint64_t sqrt_input = cx_0 + division_result_##part; \ + sqrt_result_##part = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; \ + const uint64_t s = sqrt_result_##part >> 1; \ + const uint64_t b = sqrt_result_##part & 1; \ + const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result_##part << 32); \ + sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ + } while (0) + +# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ + do { \ + const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ + } while (0) + +# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ + do { \ + const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ + hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ + vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ + } while (0) +#endif +#endif /* XMRIG_CRYPTONIGHT_MONERO_H */ diff --git a/algo/cryptonight/cryptonight_softaes.h b/algo/cryptonight/cryptonight_softaes.h index f12ab8c6..4e25b768 100644 --- a/algo/cryptonight/cryptonight_softaes.h +++ b/algo/cryptonight/cryptonight_softaes.h @@ -22,10 +22,13 @@ * along with this program. If not, see . */ -#ifndef __CRYPTONIGHT_SOFTAES_H__ -#define __CRYPTONIGHT_SOFTAES_H__ +#ifndef XMRIG_CRYPTONIGHT_SOFTAES_H +#define XMRIG_CRYPTONIGHT_SOFTAES_H + #include +#include + extern __m128i soft_aesenc(__m128i in, __m128i key); extern __m128i soft_aeskeygenassist(__m128i key, uint8_t rcon); @@ -234,4 +237,20 @@ inline uint64_t _umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *p #endif -#endif /* __CRYPTONIGHT_SOFTAES_H__ */ +static inline void cryptonight_monero_tweak(uint64_t* mem_out, __m128i tmp) +{ + mem_out[0] = EXTRACT64(tmp); + + tmp = _mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(tmp), _mm_castsi128_ps(tmp))); + uint64_t vh = EXTRACT64(tmp); + + uint8_t x = vh >> 24; + static const uint16_t table = 0x7531; + const uint8_t index = (((x >> 3) & 6) | (x & 1)) << 1; + vh ^= ((table >> index) & 0x3) << 28; + + mem_out[1] = vh; +} + + +#endif /* XMRIG_CRYPTONIGHT_SOFTAES_H */ diff --git a/algo/cryptonight/cryptonight_test.h b/algo/cryptonight/cryptonight_test.h index c5ef5037..04efe911 100644 --- a/algo/cryptonight/cryptonight_test.h +++ b/algo/cryptonight/cryptonight_test.h @@ -41,31 +41,50 @@ const static uint8_t test_input[152] = { }; -const static uint8_t test_output0[64] = { +const static uint8_t test_output_v0[64] = { 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, - 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, + 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F }; +// Cryptonight variant 1 (Monero v7) +const static uint8_t test_output_v1[64] = { + 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, + 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, + 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, + 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22 +}; + + +// Cryptonight variant 2 (Monero v8) +const static uint8_t test_output_v2[64] = { + 0x97, 0x37, 0x82, 0x82, 0xCF, 0x10, 0xE7, 0xAD, 0x03, 0x3F, 0x7B, 0x80, 0x74, 0xC4, 0x0E, 0x14, + 0xD0, 0x6E, 0x7F, 0x60, 0x9D, 0xDD, 0xDA, 0x78, 0x76, 0x80, 0xB5, 0x8C, 0x05, 0xF4, 0x3D, 0x21, + 0x87, 0x1F, 0xCD, 0x68, 0x23, 0xF6, 0xA8, 0x79, 0xBB, 0x3F, 0x33, 0x95, 0x1C, 0x8E, 0x8E, 0x89, + 0x1D, 0x40, 0x43, 0x88, 0x0B, 0x02, 0xDF, 0xA1, 0xBB, 0x3B, 0xE4, 0x98, 0xB5, 0x0E, 0x75, 0x78 +}; + + + #ifndef XMRIG_NO_AEON -const static uint8_t test_output1[64] = { +const static uint8_t test_output_v0_lite[64] = { 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, - 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, + 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD +}; + + +// AEON v7 +const static uint8_t test_output_v1_lite[64] = { + 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, + 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, + 0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45, + 0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F }; #endif -// Monero v7 -const static uint8_t test_output2[64] = { - 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, - 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, - 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, - 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, -}; - - #endif /* XMRIG_CRYPTONIGHT_TEST_H */ diff --git a/memory.c b/memory.c index 112f1115..b8a9eb65 100644 --- a/memory.c +++ b/memory.c @@ -24,32 +24,12 @@ #include #include "persistent_memory.h" -#include "algo/cryptonight/cryptonight.h" #include "options.h" + static size_t offset = 0; -#ifndef XMRIG_NO_AEON -static void * create_persistent_ctx_lite(int thr_id) { - struct cryptonight_ctx *ctx = NULL; - - if (!opt_double_hash) { - const size_t offset = MEMORY * (thr_id + 1); - - ctx = (struct cryptonight_ctx *) &persistent_memory[offset + MEMORY_LITE]; - ctx->memory = (uint8_t*) &persistent_memory[offset]; - return ctx; - } - - ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)]; - ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id + 1)]; - - return ctx; -} -#endif - - void * persistent_calloc(size_t num, size_t size) { void *mem = &persistent_memory[offset]; offset += (num * size); @@ -60,17 +40,14 @@ void * persistent_calloc(size_t num, size_t size) { } -void * create_persistent_ctx(int thr_id) { -# ifndef XMRIG_NO_AEON - if (opt_algo == ALGO_CRYPTONIGHT_LITE) { - return create_persistent_ctx_lite(thr_id); +void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id) +{ + const int ratio = (opt_double_hash && opt_algo == ALGO_CRYPTONIGHT) ? 2 : 1; + ctx[0] = persistent_calloc(1, sizeof(struct cryptonight_ctx)); + ctx[0]->memory = &persistent_memory[MEMORY * (thr_id * ratio + 1)]; + + if (opt_double_hash) { + ctx[1] = persistent_calloc(1, sizeof(struct cryptonight_ctx)); + ctx[1]->memory = ctx[0]->memory + (opt_algo == ALGO_CRYPTONIGHT ? MEMORY : MEMORY_LITE); } -# endif - - struct cryptonight_ctx *ctx = (struct cryptonight_ctx *) &persistent_memory[MEMORY - sizeof(struct cryptonight_ctx) * (thr_id + 1)]; - - const int ratio = opt_double_hash ? 2 : 1; - ctx->memory = (uint8_t*) &persistent_memory[MEMORY * (thr_id * ratio + 1)]; - - return ctx; } diff --git a/options.c b/options.c index 5dabefbd..a4cec796 100644 --- a/options.c +++ b/options.c @@ -38,7 +38,6 @@ int64_t opt_affinity = -1L; int opt_n_threads = 0; -int opt_algo_variant = 0; int opt_retries = 5; int opt_retry_pause = 5; int opt_donate_level = DONATE_LEVEL; @@ -55,13 +54,16 @@ char *opt_userpass = NULL; char *opt_user = NULL; char *opt_pass = NULL; -enum mining_algo opt_algo = ALGO_CRYPTONIGHT; +enum Algo opt_algo = ALGO_CRYPTONIGHT; +enum Variant opt_variant = VARIANT_AUTO; +enum AlgoVariant opt_av = AV_AUTO; static char const usage[] = "\ Usage: " APP_ID " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO cryptonight (default) or cryptonight-lite\n\ + --variant=N cryptonight variant: 0-2\n\ -o, --url=URL URL of mining server\n\ -b, --backup-url=URL URL of backup mining server\n\ -O, --userpass=U:P username:password pair for mining server\n\ @@ -110,18 +112,27 @@ static struct option const options[] = { { "user", 1, NULL, 'u' }, { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, - { 0, 0, 0, 0 } + { "variant", 1, NULL, 1021 }, + { NULL, 0, NULL, 0 } }; static const char *algo_names[] = { - [ALGO_CRYPTONIGHT] = "cryptonight", + "cryptonight", # ifndef XMRIG_NO_AEON - [ALGO_CRYPTONIGHT_LITE] = "cryptonight-lite" + "cryptonight-lite" # endif }; +static const char *variant_names[] = { + "auto" + "0", + "1", + "2", +}; + + #ifndef XMRIG_NO_AEON static int get_cryptonight_lite_variant(int variant) { if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) { @@ -144,11 +155,11 @@ static int get_algo_variant(int algo, int variant) { } # endif - if (variant <= XMR_AV0_AUTO || variant >= XMR_AV_MAX) { - return (cpu_info.flags & CPU_FLAG_AES) ? XMR_AV1_AESNI : XMR_AV3_SOFT_AES; + if (variant <= AV_AUTO || variant >= AV_MAX) { + return (cpu_info.flags & CPU_FLAG_AES) ? AV_SINGLE : AV_SINGLE_SOFT; } - if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= XMR_AV2_AESNI_DOUBLE) { + if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) { return variant + 2; } @@ -300,11 +311,11 @@ static void parse_arg(int key, char *arg) { case 'v': /* --av */ v = atoi(arg); - if (v < 0 || v > 1000) { + if (v <= AV_AUTO || v >= AV_MAX) { show_usage_and_exit(1); } - opt_algo_variant = v; + opt_av = v; break; case 1020: /* --cpu-affinity */ @@ -451,9 +462,9 @@ void parse_cmdline(int argc, char *argv[]) { sprintf(opt_userpass, "%s:%s", opt_user, opt_pass); } - opt_algo_variant = get_algo_variant(opt_algo, opt_algo_variant); + opt_av = get_algo_variant(opt_algo, opt_av); - if (!cryptonight_init(opt_algo_variant)) { + if (!cryptonight_init(opt_av)) { applog(LOG_ERR, "Cryptonight hash self-test failed. This might be caused by bad compiler optimizations."); proper_exit(1); } diff --git a/options.h b/options.h index a14aaeeb..0dffb1cc 100644 --- a/options.h +++ b/options.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef __OPTIONS_H__ -#define __OPTIONS_H__ +#ifndef XMRIG_OPTIONS_H +#define XMRIG_OPTIONS_H #include #include @@ -32,19 +32,28 @@ #endif -enum mining_algo { +enum Algo { ALGO_CRYPTONIGHT, /* CryptoNight (Monero) */ ALGO_CRYPTONIGHT_LITE, /* CryptoNight-Lite (AEON) */ }; -enum xmr_algo_variant { - XMR_AV0_AUTO, - XMR_AV1_AESNI, - XMR_AV2_AESNI_DOUBLE, - XMR_AV3_SOFT_AES, - XMR_AV4_SOFT_AES_DOUBLE, - XMR_AV_MAX +enum Variant { + VARIANT_AUTO = -1, + VARIANT_0 = 0, + VARIANT_1 = 1, + VARIANT_2 = 2, + VARIANT_MAX +}; + + +enum AlgoVariant { + AV_AUTO, // --av=0 Automatic mode. + AV_SINGLE, // --av=1 Single hash mode + AV_DOUBLE, // --av=2 Double hash mode + AV_SINGLE_SOFT, // --av=3 Single hash mode (Software AES) + AV_DOUBLE_SOFT, // --av=4 Double hash mode (Software AES) + AV_MAX }; @@ -72,13 +81,15 @@ extern char *opt_userpass; extern char *opt_user; extern char *opt_pass; extern int opt_n_threads; -extern int opt_algo_variant; extern int opt_retry_pause; extern int opt_retries; extern int opt_donate_level; extern int opt_max_cpu_usage; extern int64_t opt_affinity; -extern enum mining_algo opt_algo; + +extern enum Algo opt_algo; +extern enum Variant opt_variant; +extern enum AlgoVariant opt_av; void parse_cmdline(int argc, char *argv[]); void show_usage_and_exit(int status); @@ -88,4 +99,4 @@ const char* get_current_algo_name(void); extern void proper_exit(int reason); -#endif /* __OPTIONS_H__ */ +#endif /* XMRIG_OPTIONS_H */ diff --git a/persistent_memory.h b/persistent_memory.h index 5a6d6ca7..171a86ac 100644 --- a/persistent_memory.h +++ b/persistent_memory.h @@ -21,12 +21,16 @@ * along with this program. If not, see . */ -#ifndef __PERSISTENT_MEMORY_H__ -#define __PERSISTENT_MEMORY_H__ +#ifndef XMRIG_PERSISTENT_MEMORY_H +#define XMRIG_PERSISTENT_MEMORY_H + #include +#include "algo/cryptonight/cryptonight.h" + + enum memory_flags { MEMORY_HUGEPAGES_AVAILABLE = 1, MEMORY_HUGEPAGES_ENABLED = 2, @@ -44,7 +48,7 @@ extern int persistent_memory_flags; const char * persistent_memory_allocate(); void persistent_memory_free(); void * persistent_calloc(size_t num, size_t size); -void * create_persistent_ctx(int thr_id); +void create_cryptonight_ctx(struct cryptonight_ctx **ctx, int thr_id); -#endif /* __PERSISTENT_MEMORY_H__ */ +#endif /* XMRIG_PERSISTENT_MEMORY_H */ diff --git a/stratum.h b/stratum.h index 48369567..00fd426f 100644 --- a/stratum.h +++ b/stratum.h @@ -21,8 +21,9 @@ * along with this program. If not, see . */ -#ifndef __STRATUM_H__ -#define __STRATUM_H__ +#ifndef XMRIG_STRATUM_H +#define XMRIG_STRATUM_H + #include #include @@ -75,4 +76,4 @@ bool stratum_handle_method(struct stratum_ctx *sctx, const char *s); bool stratum_handle_response(char *buf); bool stratum_keepalived(struct stratum_ctx *sctx); -#endif /* __STRATUM_H__ */ +#endif /* XMRIG_STRATUM_H */ diff --git a/utils/summary.c b/utils/summary.c index 65912bb0..85cf0e41 100644 --- a/utils/summary.c +++ b/utils/summary.c @@ -77,10 +77,10 @@ static void print_threads() { } if (opt_colors) { - applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra); + applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), opt_donate_level, extra); } else { - applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s, donate=%d%%%s", opt_n_threads, opt_algo_variant, get_current_algo_name(), opt_donate_level, extra); + applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), opt_donate_level, extra); } } diff --git a/xmrig.c b/xmrig.c index 7b14933b..d79808db 100644 --- a/xmrig.c +++ b/xmrig.c @@ -260,7 +260,8 @@ static void *miner_thread(void *userdata) { uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id); + struct cryptonight_ctx *persistentctx[1]; + create_cryptonight_ctx(persistentctx, thr_id); if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) { affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity); @@ -306,7 +307,7 @@ static void *miner_thread(void *userdata) { gettimeofday(&tv_start, NULL); /* scan nonces for a proof-of-work hash */ - const int rc = scanhash_cryptonight(thr_id, hash, work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); + const int rc = scanhash_cryptonight(thr_id, hash, (const uint8_t *) work.blob, work.blob_size, work.target, max_nonce, &hashes_done, persistentctx); stats_add_hashes(thr_id, &tv_start, hashes_done); if (!rc) { @@ -335,7 +336,8 @@ static void *miner_thread_double(void *userdata) { uint32_t max_nonce; uint32_t end_nonce = 0xffffffffU / opt_n_threads * (thr_id + 1) - 0x20; - struct cryptonight_ctx *persistentctx = (struct cryptonight_ctx *) create_persistent_ctx(thr_id); + struct cryptonight_ctx *persistentctx[2]; + create_cryptonight_ctx(persistentctx, thr_id); if (cpu_info.total_logical_cpus > 1 && opt_affinity != -1L) { affine_to_cpu_mask(thr_id, (unsigned long) opt_affinity); From 61b49137c756f0bfd4ef9958020e60dc3ab7a087 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 18:03:00 +0300 Subject: [PATCH 06/16] Add single hash cn/2. --- algo/cryptonight/cryptonight.c | 4 +- algo/cryptonight/cryptonight_av1.c | 53 ++++++++++++++++ algo/cryptonight/cryptonight_av3.c | 54 ++++++++++++++++ algo/cryptonight/cryptonight_monero.h | 91 ++++++++++----------------- 4 files changed, 142 insertions(+), 60 deletions(-) diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index cd91f9a8..a501c61c 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -41,8 +41,6 @@ #include "cryptonight_test.h" #include "options.h" -#include "utils/applog.h" - void cryptonight_av1_v0(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); void cryptonight_av1_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); @@ -101,7 +99,7 @@ static bool self_test() { if (opt_algo == ALGO_CRYPTONIGHT) { result = verify(VARIANT_0, output, ctx, test_output_v0) && verify(VARIANT_1, output, ctx, test_output_v1) && - verify(VARIANT_0, output, ctx, test_output_v0); + verify(VARIANT_2, output, ctx, test_output_v2); } else { result = verify(VARIANT_0, output, ctx, test_output_v0_lite) && diff --git a/algo/cryptonight/cryptonight_av1.c b/algo/cryptonight/cryptonight_av1.c index 4028dd5d..9ef83b07 100644 --- a/algo/cryptonight/cryptonight_av1.c +++ b/algo/cryptonight/cryptonight_av1.c @@ -136,5 +136,58 @@ void cryptonight_av1_v1(const uint8_t *restrict input, size_t size, uint8_t *res void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + VARIANT2_INIT(0); + VARIANT2_SET_ROUNDING_MODE(); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + uint64_t idx0 = al0; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + + cx = _mm_aesenc_si128(cx, ax0); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = _mm_cvtsi128_si64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } diff --git a/algo/cryptonight/cryptonight_av3.c b/algo/cryptonight/cryptonight_av3.c index a70197ce..f15042b9 100644 --- a/algo/cryptonight/cryptonight_av3.c +++ b/algo/cryptonight/cryptonight_av3.c @@ -136,4 +136,58 @@ void cryptonight_av3_v1(const uint8_t *restrict input, size_t size, uint8_t *res void cryptonight_av3_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + const uint8_t* l0 = ctx[0]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + + VARIANT2_INIT(0); + VARIANT2_SET_ROUNDING_MODE(); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + uint64_t idx0 = al0; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + + cx = soft_aesenc(cx, ax0); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx0, cx)); + + idx0 = _mm_cvtsi128_si64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx0, bx1, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + + keccakf(h0, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } diff --git a/algo/cryptonight/cryptonight_monero.h b/algo/cryptonight/cryptonight_monero.h index 44ac27b0..2f64ad0a 100644 --- a/algo/cryptonight/cryptonight_monero.h +++ b/algo/cryptonight/cryptonight_monero.h @@ -31,45 +31,64 @@ #include -#define VARIANT1_INIT(part) \ +static inline __m128i int_sqrt_v2(const uint64_t n0) +{ + __m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); + x = _mm_sqrt_sd(_mm_setzero_pd(), x); + uint64_t r = (uint64_t)(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); +# if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64)) + _addcarry_u64(_subborrow_u64(0, x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +# else + if (x2 < n0) ++r; +# endif + + return _mm_cvtsi64_si128(r); +} + + +# define VARIANT1_INIT(part) \ uint64_t tweak1_2_##part = (*(const uint64_t*)(input + 35 + part * size) ^ \ *((const uint64_t*)(ctx[part]->state) + 24)); \ -#ifndef XMRIG_ARM # define VARIANT2_INIT(part) \ __m128i division_result_xmm_##part = _mm_cvtsi64_si128(h##part[12]); \ __m128i sqrt_result_xmm_##part = _mm_cvtsi64_si128(h##part[13]); #ifdef _MSC_VER -# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { _control87(RC_DOWN, MCW_RC); } +# define VARIANT2_SET_ROUNDING_MODE() { _control87(RC_DOWN, MCW_RC); } #else -# define VARIANT2_SET_ROUNDING_MODE() if (VARIANT == xmrig::VARIANT_2) { fesetround(FE_DOWNWARD); } +# define VARIANT2_SET_ROUNDING_MODE() { fesetround(FE_DOWNWARD); } #endif # define VARIANT2_INTEGER_MATH(part, cl, cx) \ - do { \ - const uint64_t sqrt_result = static_cast(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \ + { \ + const uint64_t sqrt_result = (uint64_t)(_mm_cvtsi128_si64(sqrt_result_xmm_##part)); \ const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ - cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \ - const uint32_t d = static_cast(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \ + cl ^= (uint64_t)(_mm_cvtsi128_si64(division_result_xmm_##part)) ^ (sqrt_result << 32); \ + const uint32_t d = (uint32_t)(cx_0 + (sqrt_result << 1)) | 0x80000001UL; \ const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ - const uint64_t division_result = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ - division_result_xmm_##part = _mm_cvtsi64_si128(static_cast(division_result)); \ + const uint64_t division_result = (uint32_t)(cx_1 / d) + ((cx_1 % d) << 32); \ + division_result_xmm_##part = _mm_cvtsi64_si128((int64_t)(division_result)); \ sqrt_result_xmm_##part = int_sqrt_v2(cx_0 + division_result); \ - } while (0) + } # define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ - do { \ + { \ const __m128i chunk1 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ const __m128i chunk3 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30))); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ - } while (0) + } # define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ - do { \ + { \ const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ const __m128i chunk2 = _mm_load_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20))); \ hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ @@ -78,48 +97,6 @@ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x10)), _mm_add_epi64(chunk3, _b1)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x20)), _mm_add_epi64(chunk1, _b)); \ _mm_store_si128((__m128i *)((base_ptr) + ((offset) ^ 0x30)), _mm_add_epi64(chunk2, _a)); \ - } while (0) + } -#else -# define VARIANT2_INIT(part) \ - uint64_t division_result_##part = h##part[12]; \ - uint64_t sqrt_result_##part = h##part[13]; - -# define VARIANT2_INTEGER_MATH(part, cl, cx) \ - do { \ - const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ - cl ^= division_result_##part ^ (sqrt_result_##part << 32); \ - const uint32_t d = static_cast(cx_0 + (sqrt_result_##part << 1)) | 0x80000001UL; \ - const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ - division_result_##part = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ - const uint64_t sqrt_input = cx_0 + division_result_##part; \ - sqrt_result_##part = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; \ - const uint64_t s = sqrt_result_##part >> 1; \ - const uint64_t b = sqrt_result_##part & 1; \ - const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result_##part << 32); \ - sqrt_result_##part += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ - } while (0) - -# define VARIANT2_SHUFFLE(base_ptr, offset, _a, _b, _b1) \ - do { \ - const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))); \ - const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ - const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ - } while (0) - -# define VARIANT2_SHUFFLE2(base_ptr, offset, _a, _b, _b1, hi, lo) \ - do { \ - const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \ - const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20))); \ - hi ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[0]; \ - lo ^= ((uint64_t*)((base_ptr) + ((offset) ^ 0x20)))[1]; \ - const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(_b1))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(_b))); \ - vst1q_u64((uint64_t*)((base_ptr) + ((offset) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(_a))); \ - } while (0) -#endif #endif /* XMRIG_CRYPTONIGHT_MONERO_H */ From 1e22a984af38bba253945ec51fce0d09aedad78e Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 19:25:09 +0300 Subject: [PATCH 07/16] Add double hash cn/2. --- algo/cryptonight/cryptonight_av2.c | 96 +++++++++++++++++++++++++++++ algo/cryptonight/cryptonight_av4.c | 99 +++++++++++++++++++++++++++++- 2 files changed, 194 insertions(+), 1 deletion(-) diff --git a/algo/cryptonight/cryptonight_av2.c b/algo/cryptonight/cryptonight_av2.c index 7e5f4109..654dd5bc 100644 --- a/algo/cryptonight/cryptonight_av2.c +++ b/algo/cryptonight/cryptonight_av2.c @@ -204,5 +204,101 @@ void cryptonight_av2_v1(const uint8_t *restrict input, size_t size, uint8_t *res void cryptonight_av2_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); + idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx0); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(1, cl, cx1); + lo = _umul128(idx1, cl, &hi); + VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); } diff --git a/algo/cryptonight/cryptonight_av4.c b/algo/cryptonight/cryptonight_av4.c index bb484095..5ff299c1 100644 --- a/algo/cryptonight/cryptonight_av4.c +++ b/algo/cryptonight/cryptonight_av4.c @@ -202,6 +202,103 @@ void cryptonight_av4_v1(const uint8_t *restrict input, size_t size, uint8_t *res } -void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx *restrict ctx) +void cryptonight_av4_v2(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) { + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + const uint8_t* l0 = ctx[0]->memory; + const uint8_t* l1 = ctx[1]->memory; + uint64_t* h0 = (uint64_t*) ctx[0]->state; + uint64_t* h1 = (uint64_t*) ctx[1]->state; + + VARIANT2_INIT(0); + VARIANT2_INIT(1); + VARIANT2_SET_ROUNDING_MODE(); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^ h0[4]; + uint64_t al1 = h1[0] ^ h1[4]; + uint64_t ah0 = h0[1] ^ h0[5]; + uint64_t ah1 = h1[1] ^ h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx10 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = al0; + uint64_t idx1 = al1; + + for (size_t i = 0; __builtin_expect(i < 0x80000, 1); i++) { + __m128i cx0 = _mm_load_si128((__m128i *) &l0[idx0 & 0x1FFFF0]); + __m128i cx1 = _mm_load_si128((__m128i *) &l1[idx1 & 0x1FFFF0]); + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + cx0 = soft_aesenc(cx0, ax0); + cx1 = soft_aesenc(cx1, ax1); + + VARIANT2_SHUFFLE(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01); + _mm_store_si128((__m128i *) &l0[idx0 & 0x1FFFF0], _mm_xor_si128(bx00, cx0)); + + VARIANT2_SHUFFLE(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11); + _mm_store_si128((__m128i *) &l1[idx1 & 0x1FFFF0], _mm_xor_si128(bx10, cx1)); + + idx0 = _mm_cvtsi128_si64(cx0); + idx1 = _mm_cvtsi128_si64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l0[idx0 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(0, cl, cx0); + lo = _umul128(idx0, cl, &hi); + VARIANT2_SHUFFLE2(l0, idx0 & 0x1FFFF0, ax0, bx00, bx01, hi, lo); + + al0 += hi; + ah0 += lo; + + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[0] = al0; + ((uint64_t*)&l0[idx0 & 0x1FFFF0])[1] = ah0; + + al0 ^= cl; + ah0 ^= ch; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[0]; + ch = ((uint64_t*) &l1[idx1 & 0x1FFFF0])[1]; + + VARIANT2_INTEGER_MATH(1, cl, cx1); + lo = _umul128(idx1, cl, &hi); + VARIANT2_SHUFFLE2(l1, idx1 & 0x1FFFF0, ax1, bx10, bx11, hi, lo); + + al1 += hi; + ah1 += lo; + + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[0] = al1; + ((uint64_t*)&l1[idx1 & 0x1FFFF0])[1] = ah1; + + al1 ^= cl; + ah1 ^= ch; + idx1 = al1; + + bx01 = bx00; + bx11 = bx10; + + bx00 = cx0; + bx10 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); } From 0bba8849f0385d8d9286509f86e497a00b6647d0 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 20:00:18 +0300 Subject: [PATCH 08/16] Fix Linux build. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fa22d7f..e7417c76 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -76,7 +76,7 @@ elseif (APPLE) set(SOURCES_OS mac/cpu_mac.c mac/memory_mac.c mac/xmrig_mac.c) else() set(SOURCES_OS unix/cpu_unix.c unix/memory_unix.c unix/xmrig_unix.c) - set(EXTRA_LIBS pthread) + set(EXTRA_LIBS pthread rt m) endif() include_directories(.) From bf2eb1a6853c5aab4eec9f62964fce249c374a51 Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 20:11:47 +0300 Subject: [PATCH 09/16] Fix misaligned access. --- memory.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/memory.c b/memory.c index b8a9eb65..f40646e6 100644 --- a/memory.c +++ b/memory.c @@ -4,8 +4,7 @@ * Copyright 2014 Lucas Jones * Copyright 2014-2016 Wolf9466 * Copyright 2016 Jay D Dee - * Copyright 2016-2017 XMRig - * + * Copyright 2016-2018 XMRig * * This program is free software: you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -31,6 +30,8 @@ static size_t offset = 0; void * persistent_calloc(size_t num, size_t size) { + size += size % 16; + void *mem = &persistent_memory[offset]; offset += (num * size); From a06a224c0ad9016a4b6271244924fffc574920db Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 20:27:29 +0300 Subject: [PATCH 10/16] Implement --variant option. --- options.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/options.c b/options.c index a4cec796..2d9cd818 100644 --- a/options.c +++ b/options.c @@ -341,6 +341,13 @@ static void parse_arg(int key, char *arg) { opt_donate_level = v; break; + case 1021: /* --variant */ + v = atoi(arg); + if (v > VARIANT_AUTO && v < VARIANT_MAX) { + opt_variant = v; + } + break; + case 1006: /* --nicehash */ opt_nicehash = true; break; From 779238fc858e236ce058c84730a6c94e3f2b7ffd Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 22:06:08 +0300 Subject: [PATCH 11/16] Add support for new style algorithm names. --- options.c | 49 +++++++++++++++++++++++++++++++++++++------------ options.h | 3 ++- utils/summary.c | 4 ++-- 3 files changed, 41 insertions(+), 15 deletions(-) diff --git a/options.c b/options.c index 2d9cd818..41921784 100644 --- a/options.c +++ b/options.c @@ -59,6 +59,30 @@ enum Variant opt_variant = VARIANT_AUTO; enum AlgoVariant opt_av = AV_AUTO; +struct AlgoData +{ + const char *name; + const char *shortName; + enum Algo algo; + enum Variant variant; +}; + + +static struct AlgoData const algorithms[] = { + { "cryptonight", "cn", ALGO_CRYPTONIGHT, VARIANT_AUTO }, + { "cryptonight/0", "cn/0", ALGO_CRYPTONIGHT, VARIANT_0 }, + { "cryptonight/1", "cn/1", ALGO_CRYPTONIGHT, VARIANT_1 }, + { "cryptonight/2", "cn/2", ALGO_CRYPTONIGHT, VARIANT_2 }, + +# ifndef XMRIG_NO_AEON + { "cryptonight-lite", "cn-lite", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO }, + { "cryptonight-light", "cn-light", ALGO_CRYPTONIGHT_LITE, VARIANT_AUTO }, + { "cryptonight-lite/0", "cn-lite/0", ALGO_CRYPTONIGHT_LITE, VARIANT_0 }, + { "cryptonight-lite/1", "cn-lite/1", ALGO_CRYPTONIGHT_LITE, VARIANT_1 }, +# endif +}; + + static char const usage[] = "\ Usage: " APP_ID " [OPTIONS]\n\ Options:\n\ @@ -126,7 +150,7 @@ static const char *algo_names[] = { static const char *variant_names[] = { - "auto" + "auto", "0", "1", "2", @@ -178,18 +202,13 @@ static void parse_arg(int key, char *arg) { switch (key) { - case 'a': - for (int i = 0; i < ARRAY_SIZE(algo_names); i++) { - if (algo_names[i] && !strcmp(arg, algo_names[i])) { - opt_algo = i; + case 'a': /* --algo */ + for (size_t i = 0; i < ARRAY_SIZE(algorithms); i++) { + if ((strcasecmp(arg, algorithms[i].name) == 0) || (strcasecmp(arg, algorithms[i].shortName) == 0)) { + opt_algo = algorithms[i].algo; + opt_variant = algorithms[i].variant; break; } - -# ifndef XMRIG_NO_AEON - if (i == ARRAY_SIZE(algo_names) - 1 && !strcmp(arg, "cryptonight-light")) { - opt_algo = i = ALGO_CRYPTONIGHT_LITE; - } -# endif } break; @@ -529,6 +548,12 @@ void show_version_and_exit(void) { } -const char* get_current_algo_name(void) { +const char *get_current_algo_name(void) { return algo_names[opt_algo]; } + + +const char *get_current_variant_name(void) +{ + return variant_names[opt_variant + 1]; +} diff --git a/options.h b/options.h index 0dffb1cc..4f543275 100644 --- a/options.h +++ b/options.h @@ -94,7 +94,8 @@ extern enum AlgoVariant opt_av; void parse_cmdline(int argc, char *argv[]); void show_usage_and_exit(int status); void show_version_and_exit(void); -const char* get_current_algo_name(void); +const char *get_current_algo_name(void); +const char *get_current_variant_name(void); extern void proper_exit(int reason); diff --git a/utils/summary.c b/utils/summary.c index 85cf0e41..7f109acf 100644 --- a/utils/summary.c +++ b/utils/summary.c @@ -77,10 +77,10 @@ static void print_threads() { } if (opt_colors) { - applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), opt_donate_level, extra); + applog_notime(LOG_INFO, CL_LGR " * " CL_WHT "THREADS: " CL_WHT "%d" CL_WHT ", av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra); } else { - applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), opt_donate_level, extra); + applog_notime(LOG_INFO, " * THREADS: %d, av=%d, %s/%s, donate=%d%%%s", opt_n_threads, opt_av, get_current_algo_name(), get_current_variant_name(), opt_donate_level, extra); } } From e0dc51edf9867d082f90ff4fbbae5df90aa3471c Mon Sep 17 00:00:00 2001 From: XMRig Date: Thu, 4 Oct 2018 22:12:33 +0300 Subject: [PATCH 12/16] Fixed build without cn-lite. --- algo/cryptonight/cryptonight.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index a501c61c..728e5822 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -67,8 +67,6 @@ void cryptonight_lite_av4_v0(const uint8_t *input, size_t size, uint8_t *output, void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); #endif -void (*cryptonight_hash_ctx)(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx, uint8_t version) = NULL; - static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) { @@ -101,10 +99,12 @@ static bool self_test() { verify(VARIANT_1, output, ctx, test_output_v1) && verify(VARIANT_2, output, ctx, test_output_v2); } +# ifndef XMRIG_NO_AEON else { result = verify(VARIANT_0, output, ctx, test_output_v0_lite) && verify(VARIANT_1, output, ctx, test_output_v1_lite); } +# endif for (int i = 0; i < count; ++i) { From 11748fad78e31c513a58eb033b49f034efecba93 Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 5 Oct 2018 15:02:52 +0300 Subject: [PATCH 13/16] Add ASM code. --- CMakeLists.txt | 11 +- algo/cryptonight/cryptonight.c | 68 ++- algo/cryptonight/cryptonight_av1.c | 54 +++ cmake/asm.cmake | 33 ++ cpu.c | 10 + cpu.h | 7 +- .../asm/cnv2_double_main_loop_sandybridge.inc | 410 ++++++++++++++++++ crypto/asm/cnv2_main_loop.S | 37 ++ crypto/asm/cnv2_main_loop.asm | 25 ++ crypto/asm/cnv2_main_loop_ivybridge.inc | 186 ++++++++ crypto/asm/cnv2_main_loop_ryzen.inc | 179 ++++++++ crypto/asm/win64/cnv2_main_loop.S | 21 + options.c | 30 +- options.h | 17 +- 14 files changed, 1062 insertions(+), 26 deletions(-) create mode 100644 cmake/asm.cmake create mode 100644 crypto/asm/cnv2_double_main_loop_sandybridge.inc create mode 100644 crypto/asm/cnv2_main_loop.S create mode 100644 crypto/asm/cnv2_main_loop.asm create mode 100644 crypto/asm/cnv2_main_loop_ivybridge.inc create mode 100644 crypto/asm/cnv2_main_loop_ryzen.inc create mode 100644 crypto/asm/win64/cnv2_main_loop.S diff --git a/CMakeLists.txt b/CMakeLists.txt index e7417c76..51b8b167 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,6 +3,7 @@ project(xmrig C) option(WITH_LIBCPUID "Use Libcpuid" ON) option(WITH_AEON "CryptoNight-Lite support" ON) +option(WITH_ASM "Enable ASM PoW implementations" ON) set(HEADERS algo/cryptonight/cryptonight.h @@ -125,6 +126,8 @@ else() set(SOURCES_CPUID cpu_stub.c) endif() +include(cmake/asm.cmake) + if (WITH_AEON) set(SOURCES_AEON algo/cryptonight-lite/cryptonight_lite_av1.c @@ -139,10 +142,10 @@ else() endif() if (CMAKE_SIZEOF_VOID_P EQUAL 8) - add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) + add_executable(xmrig ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES}) + target_link_libraries(xmrig ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) else() - add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON}) - target_link_libraries(xmrig32 jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) + add_executable(xmrig32 ${HEADERS} ${HEADERS_CRYPTO} ${SOURCES} ${SOURCES_CRYPTO} ${HEADERS_UTILS} ${SOURCES_UTILS} ${HEADERS_COMPAT} ${SOURCES_COMPAT} ${SOURCES_OS} ${SOURCES_CPUID} ${SOURCES_AEON} ${XMRIG_ASM_SOURCES}) + target_link_libraries(xmrig32 ${XMRIG_ASM_LIBRARY} jansson ${CURL_LIBRARY} ${CPUID_LIB} ${EXTRA_LIBS}) endif() diff --git a/algo/cryptonight/cryptonight.c b/algo/cryptonight/cryptonight.c index 728e5822..62dbdc50 100644 --- a/algo/cryptonight/cryptonight.c +++ b/algo/cryptonight/cryptonight.c @@ -33,6 +33,7 @@ # include "xmrig.h" #endif +#include "cpu.h" #include "crypto/c_blake256.h" #include "crypto/c_groestl.h" #include "crypto/c_jh.h" @@ -68,6 +69,13 @@ void cryptonight_lite_av4_v1(const uint8_t *input, size_t size, uint8_t *output, #endif +#ifndef XMRIG_NO_ASM +void cryptonight_single_hash_asm_intel(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_single_hash_asm_ryzen(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +void cryptonight_double_hash_asm(const uint8_t *input, size_t size, uint8_t *output, struct cryptonight_ctx **ctx); +#endif + + static inline bool verify(enum Variant variant, uint8_t *output, struct cryptonight_ctx **ctx, const uint8_t *referenceValue) { cn_hash_fun func = cryptonight_hash_fn(opt_algo, opt_av, variant); @@ -116,12 +124,46 @@ static bool self_test() { } +size_t fn_index(enum Algo algorithm, enum AlgoVariant av, enum Variant variant, enum Assembly assembly) +{ + const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; + +# ifndef XMRIG_NO_ASM + if (assembly == ASM_AUTO) { + assembly = cpu_info.assembly; + } + + if (assembly == ASM_NONE) { + return index; + } + + const size_t offset = VARIANT_MAX * 4 * 2; + + if (algorithm == ALGO_CRYPTONIGHT && variant == VARIANT_2) { + if (av == AV_SINGLE) { + return offset + assembly - 2; + } + + if (av == AV_DOUBLE) { + return offset + 2; + } + } +# endif + + return index; +} + + cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum Variant variant) { assert(av > AV_AUTO && av < AV_MAX); assert(variant > VARIANT_AUTO && variant < VARIANT_MAX); +# ifndef XMRIG_NO_ASM + static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2 + 3] = { +# else static const cn_hash_fun func_table[VARIANT_MAX * 4 * 2] = { +# endif cryptonight_av1_v0, cryptonight_av2_v0, cryptonight_av3_v0, @@ -147,13 +189,31 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V NULL, NULL, NULL, - NULL + NULL, +# else + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, +# endif +# ifndef XMRIG_NO_ASM + cryptonight_single_hash_asm_intel, + cryptonight_single_hash_asm_ryzen, + cryptonight_double_hash_asm # endif }; - const size_t index = VARIANT_MAX * 4 * algorithm + 4 * variant + av - 1; - # ifndef NDEBUG + const size_t index = fn_index(algorithm, av, variant, opt_assembly); + cn_hash_fun func = func_table[index]; assert(index < sizeof(func_table) / sizeof(func_table[0])); @@ -161,7 +221,7 @@ cn_hash_fun cryptonight_hash_fn(enum Algo algorithm, enum AlgoVariant av, enum V return func; # else - return func_table[index]; + return func_table[fn_index(algorithm, av, variant, opt_assembly)]; # endif } diff --git a/algo/cryptonight/cryptonight_av1.c b/algo/cryptonight/cryptonight_av1.c index 9ef83b07..c71635ea 100644 --- a/algo/cryptonight/cryptonight_av1.c +++ b/algo/cryptonight/cryptonight_av1.c @@ -191,3 +191,57 @@ void cryptonight_av1_v2(const uint8_t *restrict input, size_t size, uint8_t *res keccakf(h0, 24); extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); } + + +#ifndef XMRIG_NO_ASM +extern void cnv2_mainloop_ivybridge_asm(struct cryptonight_ctx *ctx); +extern void cnv2_mainloop_ryzen_asm(struct cryptonight_ctx *ctx); +extern void cnv2_double_mainloop_sandybridge_asm(struct cryptonight_ctx* ctx0, struct cryptonight_ctx* ctx1); + + +void cryptonight_single_hash_asm_intel(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + cnv2_mainloop_ivybridge_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_single_hash_asm_ryzen(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + + cnv2_mainloop_ryzen_asm(ctx[0]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + keccakf((uint64_t*) ctx[0]->state, 24); + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); +} + + +void cryptonight_double_hash_asm(const uint8_t *restrict input, size_t size, uint8_t *restrict output, struct cryptonight_ctx **restrict ctx) +{ + keccak(input, size, ctx[0]->state, 200); + keccak(input + size, size, ctx[1]->state, 200); + + cn_explode_scratchpad((__m128i*) ctx[0]->state, (__m128i*) ctx[0]->memory); + cn_explode_scratchpad((__m128i*) ctx[1]->state, (__m128i*) ctx[1]->memory); + + cnv2_double_mainloop_sandybridge_asm(ctx[0], ctx[1]); + + cn_implode_scratchpad((__m128i*) ctx[0]->memory, (__m128i*) ctx[0]->state); + cn_implode_scratchpad((__m128i*) ctx[1]->memory, (__m128i*) ctx[1]->state); + + keccakf((uint64_t*) ctx[0]->state, 24); + keccakf((uint64_t*) ctx[1]->state, 24); + + extra_hashes[ctx[0]->state[0] & 3](ctx[0]->state, 200, output); + extra_hashes[ctx[1]->state[0] & 3](ctx[1]->state, 200, output + 32); +} +#endif diff --git a/cmake/asm.cmake b/cmake/asm.cmake new file mode 100644 index 00000000..4420342c --- /dev/null +++ b/cmake/asm.cmake @@ -0,0 +1,33 @@ +if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8) + set(XMRIG_ASM_LIBRARY "xmrig-asm") + + if (CMAKE_C_COMPILER_ID MATCHES MSVC) + enable_language(ASM_MASM) + + if (MSVC_TOOLSET_VERSION GREATER_EQUAL 141) + set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.asm") + else() + set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.asm") + endif() + + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM) + else() + enable_language(ASM) + + if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU) + set(XMRIG_ASM_FILE "crypto/asm/win64/cnv2_main_loop.S") + else() + set(XMRIG_ASM_FILE "crypto/asm/cnv2_main_loop.S") + endif() + + set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C) + endif() + + add_library(${XMRIG_ASM_LIBRARY} STATIC ${XMRIG_ASM_FILE}) + set(XMRIG_ASM_SOURCES "") + set_property(TARGET ${XMRIG_ASM_LIBRARY} PROPERTY LINKER_LANGUAGE C) +else() + set(XMRIG_ASM_SOURCES "") + set(XMRIG_ASM_LIBRARY "") + add_definitions(/DXMRIG_NO_ASM) +endif() diff --git a/cpu.c b/cpu.c index 2f6ef8b6..0d28559a 100644 --- a/cpu.c +++ b/cpu.c @@ -31,6 +31,7 @@ #endif #include "cpu.h" +#include "options.h" #ifndef BUILD_TEST @@ -63,6 +64,15 @@ void cpu_init_common() { if (data.flags[CPU_FEATURE_AES]) { cpu_info.flags |= CPU_FLAG_AES; + +# ifndef XMRIG_NO_ASM + if (data.vendor == VENDOR_AMD) { + cpu_info.assembly = ASM_RYZEN; + } + else if (data.vendor == VENDOR_INTEL) { + cpu_info.assembly = ASM_INTEL; + } +# endif } if (data.flags[CPU_FEATURE_BMI2]) { diff --git a/cpu.h b/cpu.h index 419192bf..e9314bbe 100644 --- a/cpu.h +++ b/cpu.h @@ -21,8 +21,8 @@ * along with this program. If not, see . */ -#ifndef __CPU_H__ -#define __CPU_H__ +#ifndef XMRIG_CPU_H +#define XMRIG_CPU_H #include @@ -34,6 +34,7 @@ struct cpu_info { int l2_cache; int l3_cache; char brand[64]; + int assembly; }; extern struct cpu_info cpu_info; @@ -50,4 +51,4 @@ void cpu_init(); int get_optimal_threads_count(int algo, bool double_hash, int max_cpu_usage); int affine_to_cpu_mask(int id, unsigned long mask); -#endif /* __CPU_H__ */ +#endif /* XMRIG_CPU_H */ diff --git a/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/crypto/asm/cnv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..e8251bc7 --- /dev/null +++ b/crypto/asm/cnv2_double_main_loop_sandybridge.inc @@ -0,0 +1,410 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + ALIGN 16 +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git a/crypto/asm/cnv2_main_loop.S b/crypto/asm/cnv2_main_loop.S new file mode 100644 index 00000000..4dbcbbda --- /dev/null +++ b/crypto/asm/cnv2_main_loop.S @@ -0,0 +1,37 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +ALIGN 16 +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cnv2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +ALIGN 16 +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include "cnv2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/crypto/asm/cnv2_main_loop.asm b/crypto/asm/cnv2_main_loop.asm new file mode 100644 index 00000000..d9522267 --- /dev/null +++ b/crypto/asm/cnv2_main_loop.asm @@ -0,0 +1,25 @@ +_TEXT_CNV2_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv2_mainloop_ivybridge_asm +PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_double_mainloop_sandybridge_asm + +ALIGN 64 +cnv2_mainloop_ivybridge_asm PROC + INCLUDE cnv2_main_loop_ivybridge.inc + ret 0 +cnv2_mainloop_ivybridge_asm ENDP + +ALIGN 64 +cnv2_mainloop_ryzen_asm PROC + INCLUDE cnv2_main_loop_ryzen.inc + ret 0 +cnv2_mainloop_ryzen_asm ENDP + +ALIGN 64 +cnv2_double_mainloop_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_sandybridge.inc + ret 0 +cnv2_double_mainloop_sandybridge_asm ENDP + +_TEXT_CNV2_MAINLOOP ENDS +END diff --git a/crypto/asm/cnv2_main_loop_ivybridge.inc b/crypto/asm/cnv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..8c2c2d3b --- /dev/null +++ b/crypto/asm/cnv2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + xor eax, eax + mov QWORD PTR [rsp+16], rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + ALIGN 16 +main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + psubq xmm3, XMMWORD PTR [rsp+16] + movq rdx, xmm3 + test edx, 524287 + je sqrt_fixup_ivybridge + psrlq xmm3, 19 +sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + jmp cnv2_main_loop_ivybridge_endp + +sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp sqrt_fixup_ivybridge_ret + +cnv2_main_loop_ivybridge_endp: diff --git a/crypto/asm/cnv2_main_loop_ryzen.inc b/crypto/asm/cnv2_main_loop_ryzen.inc new file mode 100644 index 00000000..d386aa2d --- /dev/null +++ b/crypto/asm/cnv2_main_loop_ryzen.inc @@ -0,0 +1,179 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + ALIGN 16 +main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je sqrt_fixup_ryzen + shr rdi, 19 + +sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 2097136 + movdqa xmm3, xmm5 + dec ebp + jne main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp cnv2_main_loop_ryzen_endp + +sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp sqrt_fixup_ryzen_ret + +cnv2_main_loop_ryzen_endp: diff --git a/crypto/asm/win64/cnv2_main_loop.S b/crypto/asm/win64/cnv2_main_loop.S new file mode 100644 index 00000000..78eb1185 --- /dev/null +++ b/crypto/asm/win64/cnv2_main_loop.S @@ -0,0 +1,21 @@ +#define ALIGN .align +.intel_syntax noprefix +.section .text +.global cnv2_mainloop_ivybridge_asm +.global cnv2_mainloop_ryzen_asm +.global cnv2_double_mainloop_sandybridge_asm + +ALIGN 16 +cnv2_mainloop_ivybridge_asm: + #include "../cnv2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 16 +cnv2_mainloop_ryzen_asm: + #include "../cnv2_main_loop_ryzen.inc" + ret 0 + +ALIGN 16 +cnv2_double_mainloop_sandybridge_asm: + #include "../cnv2_double_main_loop_sandybridge.inc" + ret 0 diff --git a/options.c b/options.c index 41921784..f276b3f1 100644 --- a/options.c +++ b/options.c @@ -54,9 +54,10 @@ char *opt_userpass = NULL; char *opt_user = NULL; char *opt_pass = NULL; -enum Algo opt_algo = ALGO_CRYPTONIGHT; -enum Variant opt_variant = VARIANT_AUTO; -enum AlgoVariant opt_av = AV_AUTO; +enum Algo opt_algo = ALGO_CRYPTONIGHT; +enum Variant opt_variant = VARIANT_AUTO; +enum AlgoVariant opt_av = AV_AUTO; +enum Assembly opt_assembly = ASM_AUTO; struct AlgoData @@ -137,6 +138,7 @@ static struct option const options[] = { { "userpass", 1, NULL, 'O' }, { "version", 0, NULL, 'V' }, { "variant", 1, NULL, 1021 }, + { "asm", 1, NULL, 1022 }, { NULL, 0, NULL, 0 } }; @@ -157,13 +159,21 @@ static const char *variant_names[] = { }; +static const char *asm_names[] = { + "none", + "auto", + "intel", + "ryzen" +}; + + #ifndef XMRIG_NO_AEON static int get_cryptonight_lite_variant(int variant) { - if (variant <= AEON_AV0_AUTO || variant >= AEON_AV_MAX) { - return (cpu_info.flags & CPU_FLAG_AES) ? AEON_AV2_AESNI_DOUBLE : AEON_AV4_SOFT_AES_DOUBLE; + if (variant <= AV_AUTO || variant >= AV_MAX) { + return (cpu_info.flags & CPU_FLAG_AES) ? AV_DOUBLE : AV_DOUBLE_SOFT; } - if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AEON_AV2_AESNI_DOUBLE) { + if (opt_safe && !(cpu_info.flags & CPU_FLAG_AES) && variant <= AV_DOUBLE) { return variant + 2; } @@ -212,6 +222,14 @@ static void parse_arg(int key, char *arg) { } break; + case 1022: /* --asm */ + for (size_t i = 0; i < ARRAY_SIZE(asm_names); i++) { + if (strcasecmp(arg, asm_names[i]) == 0) { + opt_assembly = i; + } + } + break; + case 'O': /* --userpass */ p = strchr(arg, ':'); if (!p) { diff --git a/options.h b/options.h index 4f543275..7117130b 100644 --- a/options.h +++ b/options.h @@ -27,6 +27,7 @@ #include #include + #ifndef ARRAY_SIZE # define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) #endif @@ -57,16 +58,13 @@ enum AlgoVariant { }; -#ifndef XMRIG_NO_AEON -enum aeon_algo_variant { - AEON_AV0_AUTO, - AEON_AV1_AESNI, - AEON_AV2_AESNI_DOUBLE, - AEON_AV3_SOFT_AES, - AEON_AV4_SOFT_AES_DOUBLE, - AEON_AV_MAX +enum Assembly { + ASM_NONE, + ASM_AUTO, + ASM_INTEL, + ASM_RYZEN, + ASM_MAX }; -#endif extern bool opt_colors; @@ -90,6 +88,7 @@ extern int64_t opt_affinity; extern enum Algo opt_algo; extern enum Variant opt_variant; extern enum AlgoVariant opt_av; +extern enum Assembly opt_assembly; void parse_cmdline(int argc, char *argv[]); void show_usage_and_exit(int status); From b02f4ff1633be8c40bf6ee6a0c1c2f69e9cf3ea0 Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 5 Oct 2018 15:58:33 +0300 Subject: [PATCH 14/16] Autodetect ASM without libcpuid. --- cpu_stub.c | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/cpu_stub.c b/cpu_stub.c index 83d5efc3..7e1b4209 100644 --- a/cpu_stub.c +++ b/cpu_stub.c @@ -24,7 +24,11 @@ #include #include #include +#include + + #include "cpu.h" +#include "options.h" #define VENDOR_ID (0) @@ -53,7 +57,7 @@ static inline void cpuid(int level, int output[4]) { static void cpu_brand_string(char* s) { - int cpu_info[4] = { 0 }; + int32_t cpu_info[4] = { 0 }; cpuid(VENDOR_ID, cpu_info); if (cpu_info[EAX_Reg] >= 4) { @@ -68,7 +72,7 @@ static void cpu_brand_string(char* s) { static bool has_aes_ni() { - int cpu_info[4] = { 0 }; + int32_t cpu_info[4] = { 0 }; cpuid(PROCESSOR_INFO, cpu_info); return cpu_info[ECX_Reg] & bit_AES; @@ -76,7 +80,7 @@ static bool has_aes_ni() static bool has_bmi2() { - int cpu_info[4] = { 0 }; + int32_t cpu_info[4] = { 0 }; cpuid(EXTENDED_FEATURES, cpu_info); return cpu_info[EBX_Reg] & bit_BMI2; @@ -93,6 +97,24 @@ void cpu_init_common() { if (has_aes_ni()) { cpu_info.flags |= CPU_FLAG_AES; + +# ifndef XMRIG_NO_ASM + char vendor[13] = { 0 }; + int32_t data[4] = { 0 }; + + cpuid(0, data); + + memcpy(vendor + 0, &data[1], 4); + memcpy(vendor + 4, &data[3], 4); + memcpy(vendor + 8, &data[2], 4); + + if (memcmp(vendor, "GenuineIntel", 12) == 0) { + cpu_info.assembly = ASM_INTEL; + } + else if (memcmp(vendor, "AuthenticAMD", 12) == 0) { + cpu_info.assembly = ASM_RYZEN; + } +# endif } if (has_bmi2()) { From b9096f2392b61767b754dfe8e6db421466d7a665 Mon Sep 17 00:00:00 2001 From: XMRig Date: Fri, 5 Oct 2018 16:01:22 +0300 Subject: [PATCH 15/16] Disable donation. --- donate.h | 2 +- options.c | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/donate.h b/donate.h index 4fc60787..fec60439 100644 --- a/donate.h +++ b/donate.h @@ -24,6 +24,6 @@ #ifndef __DONATE_H__ #define __DONATE_H__ -#define DONATE_LEVEL 5 +#define DONATE_LEVEL 0 #endif /* __DONATE_H__ */ diff --git a/options.c b/options.c index f276b3f1..8fe9bff8 100644 --- a/options.c +++ b/options.c @@ -370,12 +370,12 @@ static void parse_arg(int key, char *arg) { break; case 1003: /* --donate-level */ - v = atoi(arg); - if (v < 1 || v > 99) { - show_usage_and_exit(1); - } +// v = atoi(arg); +// if (v < 1 || v > 99) { +// show_usage_and_exit(1); +// } - opt_donate_level = v; +// opt_donate_level = v; break; case 1021: /* --variant */ From 302ebe5a5b9f4d08196b42ffd000032ade3d4aa8 Mon Sep 17 00:00:00 2001 From: xmrig Date: Fri, 5 Oct 2018 16:22:16 +0300 Subject: [PATCH 16/16] Update CHANGELOG.md --- CHANGELOG.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2005d74d..d58da6e1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,12 @@ +# v0.9.0 +- **[#753](https://github.com/xmrig/xmrig/issues/753) Added new algorithm [CryptoNight variant 2](https://github.com/xmrig/xmrig/issues/753) for Monero fork, thanks [@SChernykh](https://github.com/SChernykh).** + - Added option `--asm`, possible values `--asm auto`, `--asm none`, `--asm intel` and `--asm ryzen`. +- Added support for new style long and short algorithm names, possible values: `cryptonight`, `cryptonight/0`, `cryptonight/1`, `cryptonight/2`, `cryptonight-lite`, `cryptonight-lite/0`, `cryptonight-lite/1` and short equvalents `cn/2` etc. +- Added `--variant`, example `--algo cn --variant 2`, by default miner automaticaly detect proper variant for Monero by block version. +- Added CryptoNight-Lite variant 1. +- Added xmrig-proxy autodetection, nicehash will be enabled automaticaly. +- Added workaround for xmrig-proxy [bug](https://github.com/xmrig/xmrig-proxy/commit/dfa1960fe3eeb13f80717b7dbfcc7c6e9f222d89). + # v0.8.2 - Fixed L2 cache size detection for AMD CPUs (Bulldozer/Piledriver/Steamroller/Excavator architecture). - Fixed gcc 7.1 support.