From 90699d58ec347bffdcfed439dfbeea2b3d8ddbce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ben=20Gr=C3=A4f?= Date: Tue, 26 Jun 2018 20:25:38 +0200 Subject: [PATCH] Features of 1.6.5 (#140) * Hashrate improve -> add autodetection mode for cpu-affinity * Hashrate improve, more stable hashrates -> refactor memory allocation * Add TubeV4 support (cn-heavy + ipbc mod + soft-aes mod) * Update ccp-httpd lib to fix stop/freeze of cc communication on some miners * Fix cn-heavy on arm processors --- src/3rdparty/cpp-httplib/httplib.h | 3164 +++++++++++++------- src/App.cpp | 25 +- src/App_unix.cpp | 4 - src/App_win.cpp | 4 - src/Cpu.cpp | 25 +- src/Cpu.h | 3 +- src/CpuImpl.h | 3 +- src/Cpu_mac.cpp | 24 +- src/Cpu_unix.cpp | 44 +- src/Cpu_win.cpp | 38 +- src/Mem.cpp | 49 +- src/Mem.h | 47 +- src/Mem_unix.cpp | 83 +- src/Mem_win.cpp | 87 +- src/Options.cpp | 8 +- src/PowVariant.h | 10 +- src/Summary.cpp | 20 +- src/api/ApiState.cpp | 2 +- src/cc/CCClient.cpp | 5 +- src/crypto/CryptoNight.cpp | 176 +- src/crypto/CryptoNight.h | 6 +- src/crypto/CryptoNight_arm.h | 3366 +++++++++++++--------- src/crypto/CryptoNight_test.h | 12 +- src/crypto/CryptoNight_x86.h | 1254 +++++--- src/crypto/SSE2NEON.h | 6 + src/crypto/soft_aes.h | 25 +- src/log/Log.h | 13 + src/version.h | 4 +- src/workers/Handle.cpp | 2 +- src/workers/Handle.h | 10 +- src/workers/MultiWorker.cpp | 48 +- src/workers/MultiWorker.h | 2 +- src/workers/Worker.cpp | 5 +- src/workers/Worker.h | 6 +- src/workers/Workers.cpp | 9 +- src/workers/Workers.h | 2 +- test/cryptonight/cryptonight.c | 20 +- test/cryptonight_lite/cryptonight_lite.c | 28 +- 38 files changed, 5525 insertions(+), 3114 deletions(-) diff --git a/src/3rdparty/cpp-httplib/httplib.h b/src/3rdparty/cpp-httplib/httplib.h index d3c46c7c..87171e0b 100644 --- a/src/3rdparty/cpp-httplib/httplib.h +++ b/src/3rdparty/cpp-httplib/httplib.h @@ -2,30 +2,31 @@ // httplib.h // // Copyright (c) 2017 Yuji Hirose. All rights reserved. 
-// The Boost Software License 1.0 +// MIT License // #ifndef _CPPHTTPLIB_HTTPLIB_H_ #define _CPPHTTPLIB_HTTPLIB_H_ -#ifdef _MSC_VER +#ifdef _WIN32 +#ifndef _CRT_SECURE_NO_WARNINGS #define _CRT_SECURE_NO_WARNINGS +#endif +#ifndef _CRT_NONSTDC_NO_DEPRECATE #define _CRT_NONSTDC_NO_DEPRECATE +#endif -#ifndef SO_SYNCHRONOUS_NONALERT -#define SO_SYNCHRONOUS_NONALERT 0x20 -#endif -#ifndef SO_OPENTYPE -#define SO_OPENTYPE 0x7008 -#endif -#if (_MSC_VER < 1900) +#if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf_s #endif +#ifndef S_ISREG #define S_ISREG(m) (((m)&S_IFREG)==S_IFREG) +#endif +#ifndef S_ISDIR #define S_ISDIR(m) (((m)&S_IFDIR)==S_IFDIR) +#endif -#include #include #include #include @@ -33,717 +34,1238 @@ #undef min #undef max +#ifndef strcasecmp +#define strcasecmp _stricmp +#endif + typedef SOCKET socket_t; #else #include #include - -#if WIN32 -#include -#else #include +#include #include #include -#include -#endif - -#include #include +#include +#include typedef int socket_t; +#define INVALID_SOCKET (-1) #endif #include #include #include #include +#include #include #include +#include #include +#include #include #ifdef CPPHTTPLIB_OPENSSL_SUPPORT #include #endif +#ifdef CPPHTTPLIB_ZLIB_SUPPORT +#include +#endif + +/* + * Configuration + */ +#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5 +#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND 0 + namespace httplib { -typedef std::map Map; -typedef std::multimap MultiMap; -typedef std::smatch Match; + namespace detail { -struct Request { - std::string method; - std::string path; - MultiMap headers; - std::string body; - Map params; - Match matches; + struct ci { + bool operator() (const std::string & s1, const std::string & s2) const { + return std::lexicographical_compare( + s1.begin(), s1.end(), + s2.begin(), s2.end(), + [](char c1, char c2) { + return ::tolower(c1) < ::tolower(c2); + }); + } + }; - bool has_header(const char* key) const; - std::string get_header_value(const char* key) const; - void 
set_header(const char* key, const char* val); + } // namespace detail - bool has_param(const char* key) const; -}; + enum class HttpVersion { v1_0 = 0, v1_1 }; -struct Response { - int status; - MultiMap headers; - std::string body; + typedef std::multimap Headers; - bool has_header(const char* key) const; - std::string get_header_value(const char* key) const; - void set_header(const char* key, const char* val); + template + std::pair make_range_header(uint64_t value, Args... args); - void set_redirect(const char* url); - void set_content(const char* s, size_t n, const char* content_type); - void set_content(const std::string& s, const char* content_type); + typedef std::multimap Params; + typedef std::smatch Match; + typedef std::function Progress; - Response() : status(-1) {} -}; + struct MultipartFile { + std::string filename; + std::string content_type; + size_t offset = 0; + size_t length = 0; + }; + typedef std::multimap MultipartFiles; -class Stream { -public: - virtual ~Stream() {} - virtual int read(char* ptr, size_t size) = 0; - virtual int write(const char* ptr, size_t size1) = 0; - virtual int write(const char* ptr) = 0; -}; + struct Request { + std::string version; + std::string method; + std::string target; + std::string path; + Headers headers; + std::string body; + Params params; + MultipartFiles files; + Match matches; -class SocketStream : public Stream { -public: - SocketStream(socket_t sock); - virtual ~SocketStream(); + Progress progress; - virtual int read(char* ptr, size_t size); - virtual int write(const char* ptr, size_t size); - virtual int write(const char* ptr); + bool has_header(const char* key) const; + std::string get_header_value(const char* key) const; + void set_header(const char* key, const char* val); -private: - socket_t sock_; -}; + bool has_param(const char* key) const; + std::string get_param_value(const char* key) const; -class Server { -public: - typedef std::function Handler; - typedef std::function Logger; + bool 
has_file(const char* key) const; + MultipartFile get_file_value(const char* key) const; + }; - Server(); - virtual ~Server(); + struct Response { + std::string version; + int status; + Headers headers; + std::string body; - void get(const char* pattern, Handler handler); - void post(const char* pattern, Handler handler); + bool has_header(const char* key) const; + std::string get_header_value(const char* key) const; + void set_header(const char* key, const char* val); - bool set_base_dir(const char* path); + void set_redirect(const char* uri); + void set_content(const char* s, size_t n, const char* content_type); + void set_content(const std::string& s, const char* content_type); - void set_error_handler(Handler handler); - void set_logger(Logger logger); + Response() : status(-1) {} + }; - bool listen(const char* host, int port, int socket_flags = 0); - void stop(); + class Stream { + public: + virtual ~Stream() {} + virtual int read(char* ptr, size_t size) = 0; + virtual int write(const char* ptr, size_t size1) = 0; + virtual int write(const char* ptr) = 0; + virtual std::string get_remote_addr() = 0; -protected: - void process_request(Stream& strm); + template + void write_format(const char* fmt, const Args& ...args); + }; -private: - typedef std::vector> Handlers; + class SocketStream : public Stream { + public: + SocketStream(socket_t sock); + virtual ~SocketStream(); - bool routing(Request& req, Response& res); - bool handle_file_request(Request& req, Response& res); - bool dispatch_request(Request& req, Response& res, Handlers& handlers); + virtual int read(char* ptr, size_t size); + virtual int write(const char* ptr, size_t size); + virtual int write(const char* ptr); + virtual std::string get_remote_addr(); - bool read_request_line(Stream& strm, Request& req); + private: + socket_t sock_; + }; - virtual bool read_and_close_socket(socket_t sock); + class Server { + public: + typedef std::function Handler; + typedef std::function Logger; - socket_t 
svr_sock_; - std::string base_dir_; - Handlers get_handlers_; - Handlers post_handlers_; - Handler error_handler_; - Logger logger_; -}; + Server(); -class Client { -public: - Client(const char* host, int port); - virtual ~Client(); + virtual ~Server(); - std::shared_ptr get(const char* path); - std::shared_ptr head(const char* path); - std::shared_ptr post(const char* path, const std::string& body, const char* content_type); - std::shared_ptr post(const char* path, const Map& params); + virtual bool is_valid() const; - bool send(const Request& req, Response& res); + Server& Get(const char* pattern, Handler handler); + Server& Post(const char* pattern, Handler handler); -protected: - bool process_request(Stream& strm, const Request& req, Response& res); + Server& Put(const char* pattern, Handler handler); + Server& Delete(const char* pattern, Handler handler); + Server& Options(const char* pattern, Handler handler); - const std::string host_; - const int port_; - const std::string host_and_port_; + bool set_base_dir(const char* path); -private: - bool read_response_line(Stream& strm, Response& res); - void add_default_headers(Request& req); + void set_error_handler(Handler handler); + void set_logger(Logger logger); - virtual bool read_and_close_socket(socket_t sock, const Request& req, Response& res); -}; + void set_keep_alive_max_count(size_t count); + + int bind_to_any_port(const char* host, int socket_flags = 0); + bool listen_after_bind(); + + bool listen(const char* host, int port, int socket_flags = 0); + + bool is_running() const; + void stop(); + + protected: + bool process_request(Stream& strm, bool last_connection, bool& connection_close); + + size_t keep_alive_max_count_; + + private: + typedef std::vector> Handlers; + + socket_t create_server_socket(const char* host, int port, int socket_flags) const; + int bind_internal(const char* host, int port, int socket_flags); + bool listen_internal(); + + bool routing(Request& req, Response& res); + bool 
handle_file_request(Request& req, Response& res); + bool dispatch_request(Request& req, Response& res, Handlers& handlers); + + bool parse_request_line(const char* s, Request& req); + void write_response(Stream& strm, bool last_connection, const Request& req, Response& res); + + virtual bool read_and_close_socket(socket_t sock); + + bool is_running_; + socket_t svr_sock_; + std::string base_dir_; + Handlers get_handlers_; + Handlers post_handlers_; + Handlers put_handlers_; + Handlers delete_handlers_; + Handlers options_handlers_; + Handler error_handler_; + Logger logger_; + + // TODO: Use thread pool... + std::mutex running_threads_mutex_; + int running_threads_; + }; + + class Client { + public: + Client( + const char* host, + int port = 80, + size_t timeout_sec = 300); + + virtual ~Client(); + + virtual bool is_valid() const; + + std::shared_ptr Get(const char* path, Progress progress = nullptr); + std::shared_ptr Get(const char* path, const Headers& headers, Progress progress = nullptr); + + std::shared_ptr Head(const char* path); + std::shared_ptr Head(const char* path, const Headers& headers); + + std::shared_ptr Post(const char* path, const std::string& body, const char* content_type); + std::shared_ptr Post(const char* path, const Headers& headers, const std::string& body, const char* content_type); + + std::shared_ptr Post(const char* path, const Params& params); + std::shared_ptr Post(const char* path, const Headers& headers, const Params& params); + + std::shared_ptr Put(const char* path, const std::string& body, const char* content_type); + std::shared_ptr Put(const char* path, const Headers& headers, const std::string& body, const char* content_type); + + std::shared_ptr Delete(const char* path); + std::shared_ptr Delete(const char* path, const Headers& headers); + + std::shared_ptr Options(const char* path); + std::shared_ptr Options(const char* path, const Headers& headers); + + bool send(Request& req, Response& res); + + protected: + bool 
process_request(Stream& strm, Request& req, Response& res, bool& connection_close); + + const std::string host_; + const int port_; + size_t timeout_sec_; + const std::string host_and_port_; + + private: + socket_t create_client_socket() const; + bool read_response_line(Stream& strm, Response& res); + void write_request(Stream& strm, Request& req); + + virtual bool read_and_close_socket(socket_t sock, Request& req, Response& res); + }; #ifdef CPPHTTPLIB_OPENSSL_SUPPORT -class SSLSocketStream : public Stream { -public: - SSLSocketStream(SSL* ssl); - virtual ~SSLSocketStream(); + class SSLSocketStream : public Stream { + public: + SSLSocketStream(socket_t sock, SSL* ssl); + virtual ~SSLSocketStream(); - virtual int read(char* ptr, size_t size); - virtual int write(const char* ptr, size_t size); - virtual int write(const char* ptr); + virtual int read(char* ptr, size_t size); + virtual int write(const char* ptr, size_t size); + virtual int write(const char* ptr); + virtual std::string get_remote_addr(); -private: - SSL* ssl_; -}; + private: + socket_t sock_; + SSL* ssl_; + }; -class SSLServer : public Server { -public: - SSLServer(const char* cert_path, const char* private_key_path); - virtual ~SSLServer(); + class SSLServer : public Server { + public: + SSLServer( + const char* cert_path, const char* private_key_path); -private: - virtual bool read_and_close_socket(socket_t sock); + virtual ~SSLServer(); - SSL_CTX* ctx_; -}; + virtual bool is_valid() const; -class SSLClient : public Client { -public: - SSLClient(const char* host, int port); - virtual ~SSLClient(); + private: + virtual bool read_and_close_socket(socket_t sock); -private: - virtual bool read_and_close_socket(socket_t sock, const Request& req, Response& res); + SSL_CTX* ctx_; + std::mutex ctx_mutex_; + }; - SSL_CTX* ctx_; -}; + class SSLClient : public Client { + public: + SSLClient( + const char* host, + int port = 80, + size_t timeout_sec = 300); + + virtual ~SSLClient(); + + virtual bool is_valid() 
const; + + private: + virtual bool read_and_close_socket(socket_t sock, Request& req, Response& res); + + SSL_CTX* ctx_; + std::mutex ctx_mutex_; + }; #endif /* * Implementation */ -namespace detail { + namespace detail { -template -void split(const char* b, const char* e, char d, Fn fn) -{ - int i = 0; - int beg = 0; + template + void split(const char* b, const char* e, char d, Fn fn) + { + int i = 0; + int beg = 0; - while (e ? (b + i != e) : (b[i] != '\0')) { - if (b[i] == d) { - fn(&b[beg], &b[i]); - beg = i + 1; - } - i++; - } + while (e ? (b + i != e) : (b[i] != '\0')) { + if (b[i] == d) { + fn(&b[beg], &b[i]); + beg = i + 1; + } + i++; + } - if (i) { - fn(&b[beg], &b[i]); - } -} - -inline bool socket_gets(Stream& strm, char* buf, int bufsiz) -{ - // TODO: buffering for better performance - size_t i = 0; - - for (;;) { - char byte; - auto n = strm.read(&byte, 1); - - if (n < 1) { - if (i == 0) { - return false; - } else { - break; + if (i) { + fn(&b[beg], &b[i]); } } - buf[i++] = byte; +// NOTE: until the read size reaches `fixed_buffer_size`, use `fixed_buffer` +// to store data. The call can set memory on stack for performance. + class stream_line_reader { + public: + stream_line_reader(Stream& strm, char* fixed_buffer, size_t fixed_buffer_size) + : strm_(strm) + , fixed_buffer_(fixed_buffer) + , fixed_buffer_size_(fixed_buffer_size) { + } - if (byte == '\n') { - break; - } - } + const char* ptr() const { + if (glowable_buffer_.empty()) { + return fixed_buffer_; + } else { + return glowable_buffer_.data(); + } + } - buf[i] = '\0'; - return true; -} + bool getline() { + fixed_buffer_used_size_ = 0; + glowable_buffer_.clear(); -template -inline void socket_printf(Stream& strm, const char* fmt, const Args& ...args) -{ - char buf[BUFSIZ]; - auto n = snprintf(buf, BUFSIZ, fmt, args...); - if (n > 0) { - if (n >= BUFSIZ) { - // TODO: buffer size is not large enough... 
- } else { - strm.write(buf, n); - } - } -} + for (size_t i = 0; ; i++) { + char byte; + auto n = strm_.read(&byte, 1); -inline int close_socket(socket_t sock) -{ -#if defined(_MSC_VER) || defined(WIN32) - return closesocket(sock); + if (n < 0) { + return false; + } else if (n == 0) { + if (i == 0) { + return false; + } else { + break; + } + } + + append(byte); + + if (byte == '\n') { + break; + } + } + + return true; + } + + private: + void append(char c) { + if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) { + fixed_buffer_[fixed_buffer_used_size_++] = c; + fixed_buffer_[fixed_buffer_used_size_] = '\0'; + } else { + if (glowable_buffer_.empty()) { + assert(fixed_buffer_[fixed_buffer_used_size_] == '\0'); + glowable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_); + } + glowable_buffer_ += c; + } + } + + Stream& strm_; + char* fixed_buffer_; + const size_t fixed_buffer_size_; + size_t fixed_buffer_used_size_; + std::string glowable_buffer_; + }; + + inline int close_socket(socket_t sock) + { +#ifdef _WIN32 + return closesocket(sock); #else - return close(sock); + return close(sock); #endif -} + } -template -inline bool read_and_close_socket(socket_t sock, T callback) -{ - SocketStream strm(sock); - auto ret = callback(strm); - close_socket(sock); - return ret; -} + inline int select_read(socket_t sock, size_t sec, size_t usec) + { + fd_set fds; + FD_ZERO(&fds); + FD_SET(sock, &fds); -inline int shutdown_socket(socket_t sock) -{ -#if defined(_MSC_VER) || defined(WIN32) - return shutdown(sock, SD_BOTH); + timeval tv; + tv.tv_sec = sec; + tv.tv_usec = usec; + + return select(sock + 1, &fds, NULL, NULL, &tv); + } + + inline bool wait_until_socket_is_ready(socket_t sock, size_t sec, size_t usec) + { + fd_set fdsr; + FD_ZERO(&fdsr); + FD_SET(sock, &fdsr); + + auto fdsw = fdsr; + auto fdse = fdsr; + + timeval tv; + tv.tv_sec = sec; + tv.tv_usec = usec; + + if (select(sock + 1, &fdsr, &fdsw, &fdse, &tv) < 0) { + return false; + } else if (FD_ISSET(sock, &fdsr) 
|| FD_ISSET(sock, &fdsw)) { + int error = 0; + socklen_t len = sizeof(error); + if (getsockopt(sock, SOL_SOCKET, SO_ERROR, (char*)&error, &len) < 0 || error) { + return false; + } + } else { + return false; + } + + return true; + } + + template + inline bool read_and_close_socket(socket_t sock, size_t keep_alive_max_count, T callback) + { + bool ret = false; + + if (keep_alive_max_count > 0) { + auto count = keep_alive_max_count; + while (count > 0 && + detail::select_read(sock, + CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND, + CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0) { + SocketStream strm(sock); + auto last_connection = count == 1; + auto connection_close = false; + + ret = callback(strm, last_connection, connection_close); + if (!ret || connection_close) { + break; + } + + count--; + } + } else { + SocketStream strm(sock); + auto dummy_connection_close = false; + ret = callback(strm, true, dummy_connection_close); + } + + close_socket(sock); + return ret; + } + + inline int shutdown_socket(socket_t sock) + { +#ifdef _WIN32 + return shutdown(sock, SD_BOTH); #else - return shutdown(sock, SHUT_RDWR); + return shutdown(sock, SHUT_RDWR); #endif -} + } + + template + socket_t create_socket(const char* host, int port, Fn fn, int socket_flags = 0) + { +#ifdef _WIN32 + #define SO_SYNCHRONOUS_NONALERT 0x20 +#define SO_OPENTYPE 0x7008 -template -socket_t create_socket(const char* host, int port, Fn fn, int socket_flags = 0) -{ -#if defined(_MSC_VER) || defined(WIN32) int opt = SO_SYNCHRONOUS_NONALERT; setsockopt(INVALID_SOCKET, SOL_SOCKET, SO_OPENTYPE, (char*)&opt, sizeof(opt)); #endif - // Get address info - struct addrinfo hints; - struct addrinfo *result; + // Get address info + struct addrinfo hints; + struct addrinfo *result; - memset(&hints, 0, sizeof(struct addrinfo)); - hints.ai_family = AF_UNSPEC; - hints.ai_socktype = SOCK_STREAM; - hints.ai_flags = socket_flags; - hints.ai_protocol = 0; + memset(&hints, 0, sizeof(struct addrinfo)); + hints.ai_family = AF_UNSPEC; + 
hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = socket_flags; + hints.ai_protocol = 0; - auto service = std::to_string(port); + auto service = std::to_string(port); - if (getaddrinfo(host, service.c_str(), &hints, &result)) { - return -1; - } - - for (auto rp = result; rp; rp = rp->ai_next) { - // Create a socket - auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); - if (sock == -1) { - continue; - } - - // Make 'reuse address' option available - int yes = 1; - setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&yes, sizeof(yes)); - - struct timeval timeout; - timeout.tv_sec = 10; - timeout.tv_usec = 0; - - setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); - setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); - - // bind or connect - if (fn(sock, *rp)) { - freeaddrinfo(result); - return sock; - } - - close_socket(sock); - } - - freeaddrinfo(result); - return -1; -} - -inline socket_t create_server_socket(const char* host, int port, int socket_flags) -{ - return create_socket(host, port, [](socket_t sock, struct addrinfo& ai) -> socket_t { - if (::bind(sock, ai.ai_addr, ai.ai_addrlen)) { - return false; - } - if (listen(sock, 5)) { // Listen through 5 channels - return false; - } - return true; - }, socket_flags); -} - -inline socket_t create_client_socket(const char* host, int port) -{ - return create_socket(host, port, [](socket_t sock, struct addrinfo& ai) -> socket_t { - if (connect(sock, ai.ai_addr, ai.ai_addrlen)) { - return false; - } - return true; - }); -} - -inline bool is_file(const std::string& s) -{ - struct stat st; - return stat(s.c_str(), &st) >= 0 && S_ISREG(st.st_mode); -} - -inline bool is_dir(const std::string& s) -{ - struct stat st; - return stat(s.c_str(), &st) >= 0 && S_ISDIR(st.st_mode); -} - -inline void read_file(const std::string& path, std::string& out) -{ - std::ifstream fs(path, std::ios_base::binary); - fs.seekg(0, std::ios_base::end); - auto size = 
fs.tellg(); - fs.seekg(0); - out.resize(static_cast(size)); - fs.read(&out[0], size); -} - -inline std::string file_extension(const std::string& path) -{ - std::smatch m; - auto pat = std::regex("\\.([a-zA-Z0-9]+)$"); - if (std::regex_search(path, m, pat)) { - return m[1].str(); - } - return std::string(); -} - -inline const char* content_type(const std::string& path) -{ - auto ext = detail::file_extension(path); - if (ext == "txt") { - return "text/plain"; - } else if (ext == "html") { - return "text/html"; - } else if (ext == "js") { - return "text/javascript"; - } else if (ext == "css") { - return "text/css"; - } else if (ext == "xml") { - return "text/xml"; - } else if (ext == "jpeg" || ext == "jpg") { - return "image/jpg"; - } else if (ext == "png") { - return "image/png"; - } else if (ext == "gif") { - return "image/gif"; - } else if (ext == "svg") { - return "image/svg+xml"; - } else if (ext == "ico") { - return "image/x-icon"; - } else if (ext == "json") { - return "application/json"; - } else if (ext == "pdf") { - return "application/pdf"; - } else if (ext == "xhtml") { - return "application/xhtml+xml"; - } - return nullptr; -} - -inline const char* status_message(int status) -{ - switch (status) { - case 200: return "OK"; - case 400: return "Bad Request"; - case 404: return "Not Found"; - default: - case 500: return "Internal Server Error"; - } -} - -inline const char* get_header_value(const MultiMap& map, const char* key, const char* def) -{ - auto it = map.find(key); - if (it != map.end()) { - return it->second.c_str(); - } - return def; -} - -inline int get_header_value_int(const MultiMap& map, const char* key, int def) -{ - auto it = map.find(key); - if (it != map.end()) { - return std::stoi(it->second); - } - return def; -} - -inline bool read_headers(Stream& strm, MultiMap& headers) -{ - static std::regex re("(.+?): (.+?)\r\n"); - - const auto BUFSIZ_HEADER = 2048; - char buf[BUFSIZ_HEADER]; - - for (;;) { - if (!socket_gets(strm, buf, 
BUFSIZ_HEADER)) { - return false; - } - if (!strcmp(buf, "\r\n")) { - break; - } - std::cmatch m; - if (std::regex_match(buf, m, re)) { - auto key = std::string(m[1]); - auto val = std::string(m[2]); - headers.insert(std::make_pair(key, val)); - } - } - - return true; -} - -template -bool read_content(Stream& strm, T& x, bool allow_no_content_length) -{ - auto len = get_header_value_int(x.headers, "Content-Length", 0); - if (len) { - x.body.assign(len, 0); - auto r = 0; - while (r < len){ - auto r_incr = strm.read(&x.body[r], len - r); - if (r_incr <= 0) { - return false; + if (getaddrinfo(host, service.c_str(), &hints, &result)) { + return INVALID_SOCKET; } - r += r_incr; + + for (auto rp = result; rp; rp = rp->ai_next) { + // Create a socket + auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sock == INVALID_SOCKET) { + continue; + } + + // Make 'reuse address' option available + int yes = 1; + setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&yes, sizeof(yes)); + + // Make socket also having a timeout + struct timeval timeout; + timeout.tv_sec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND; + timeout.tv_usec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND; + + setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout)); + setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout)); + + // bind or connect + if (fn(sock, *rp)) { + freeaddrinfo(result); + return sock; + } + + close_socket(sock); + } + + freeaddrinfo(result); + return INVALID_SOCKET; } - } else if (allow_no_content_length) { - for (;;) { - char byte; - auto n = strm.read(&byte, 1); - if (n < 1) { - if (x.body.size() == 0) { - return true; // no body + + inline void set_nonblocking(socket_t sock, bool nonblocking) + { +#ifdef _WIN32 + auto flags = nonblocking ? 1UL : 0UL; + ioctlsocket(sock, FIONBIO, &flags); +#else + auto flags = fcntl(sock, F_GETFL, 0); + fcntl(sock, F_SETFL, nonblocking ? 
(flags | O_NONBLOCK) : (flags & (~O_NONBLOCK))); +#endif + } + + inline bool is_connection_error() + { +#ifdef _WIN32 + return WSAGetLastError() != WSAEWOULDBLOCK; +#else + return errno != EINPROGRESS; +#endif + } + + inline std::string get_remote_addr(socket_t sock) { + struct sockaddr_storage addr; + socklen_t len = sizeof(addr); + + if (!getpeername(sock, (struct sockaddr*)&addr, &len)) { + char ipstr[NI_MAXHOST]; + + if (!getnameinfo((struct sockaddr*)&addr, len, + ipstr, sizeof(ipstr), nullptr, 0, NI_NUMERICHOST)) { + return ipstr; + } + } + + return std::string(); + } + + inline bool is_file(const std::string& path) + { + struct stat st; + return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode); + } + + inline bool is_dir(const std::string& path) + { + struct stat st; + return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode); + } + + inline bool is_valid_path(const std::string& path) { + size_t level = 0; + size_t i = 0; + + // Skip slash + while (i < path.size() && path[i] == '/') { + i++; + } + + while (i < path.size()) { + // Read component + auto beg = i; + while (i < path.size() && path[i] != '/') { + i++; + } + + auto len = i - beg; + assert(len > 0); + + if (!path.compare(beg, len, ".")) { + ; + } else if (!path.compare(beg, len, "..")) { + if (level == 0) { + return false; + } + level--; } else { + level++; + } + + // Skip slash + while (i < path.size() && path[i] == '/') { + i++; + } + } + + return true; + } + + inline void read_file(const std::string& path, std::string& out) + { + std::ifstream fs(path, std::ios_base::binary); + fs.seekg(0, std::ios_base::end); + auto size = fs.tellg(); + fs.seekg(0); + out.resize(static_cast(size)); + fs.read(&out[0], size); + } + + inline std::string file_extension(const std::string& path) + { + std::smatch m; + auto pat = std::regex("\\.([a-zA-Z0-9]+)$"); + if (std::regex_search(path, m, pat)) { + return m[1].str(); + } + return std::string(); + } + + inline const char* find_content_type(const std::string& 
path) + { + auto ext = file_extension(path); + if (ext == "txt") { + return "text/plain"; + } else if (ext == "html") { + return "text/html"; + } else if (ext == "css") { + return "text/css"; + } else if (ext == "jpeg" || ext == "jpg") { + return "image/jpg"; + } else if (ext == "png") { + return "image/png"; + } else if (ext == "gif") { + return "image/gif"; + } else if (ext == "svg") { + return "image/svg+xml"; + } else if (ext == "ico") { + return "image/x-icon"; + } else if (ext == "json") { + return "application/json"; + } else if (ext == "pdf") { + return "application/pdf"; + } else if (ext == "js") { + return "application/javascript"; + } else if (ext == "xml") { + return "application/xml"; + } else if (ext == "xhtml") { + return "application/xhtml+xml"; + } + return nullptr; + } + + inline const char* status_message(int status) + { + switch (status) { + case 200: return "OK"; + case 301: return "Moved Permanently"; + case 302: return "Found"; + case 303: return "See Other"; + case 304: return "Not Modified"; + case 400: return "Bad Request"; + case 403: return "Forbidden"; + case 404: return "Not Found"; + case 415: return "Unsupported Media Type"; + default: + case 500: return "Internal Server Error"; + } + } + + inline const char* get_header_value(const Headers& headers, const char* key, const char* def) + { + auto it = headers.find(key); + if (it != headers.end()) { + return it->second.c_str(); + } + return def; + } + + inline int get_header_value_int(const Headers& headers, const char* key, int def) + { + auto it = headers.find(key); + if (it != headers.end()) { + return std::stoi(it->second); + } + return def; + } + + inline bool read_headers(Stream& strm, Headers& headers) + { + static std::regex re(R"((.+?):\s*(.+?)\s*\r\n)"); + + const auto bufsiz = 2048; + char buf[bufsiz]; + + stream_line_reader reader(strm, buf, bufsiz); + + for (;;) { + if (!reader.getline()) { + return false; + } + if (!strcmp(reader.ptr(), "\r\n")) { break; } - } - x.body += 
byte; - } - } - return true; -} - -template -inline void write_headers(Stream& strm, const T& res) -{ - strm.write("Connection: close\r\n"); - - for (const auto& x: res.headers) { - if (x.first != "Content-Type" && x.first != "Content-Length") { - socket_printf(strm, "%s: %s\r\n", x.first.c_str(), x.second.c_str()); - } - } - - auto t = get_header_value(res.headers, "Content-Type", "text/plain"); - socket_printf(strm, "Content-Type: %s\r\n", t); - socket_printf(strm, "Content-Length: %ld\r\n", res.body.size()); - strm.write("\r\n"); -} - -inline void write_response(Stream& strm, const Request& req, const Response& res) -{ - socket_printf(strm, "HTTP/1.0 %d %s\r\n", res.status, status_message(res.status)); - - write_headers(strm, res); - - if (!res.body.empty() && req.method != "HEAD") { - strm.write(res.body.c_str(), res.body.size()); - } -} - -inline std::string encode_url(const std::string& s) -{ - std::string result; - - for (auto i = 0; s[i]; i++) { - switch (s[i]) { - case ' ': result += "+"; break; - case '\'': result += "%27"; break; - case ',': result += "%2C"; break; - case ':': result += "%3A"; break; - case ';': result += "%3B"; break; - default: - if (s[i] < 0) { - result += '%'; - char hex[4]; - size_t len = snprintf(hex, sizeof(hex), "%02X", (unsigned char)s[i]); - assert(len == 2); - result.append(hex, len); - } else { - result += s[i]; - } - break; - } - } - - return result; -} - -inline bool is_hex(char c, int& v) -{ - if (0x20 <= c && isdigit(c)) { - v = c - '0'; - return true; - } else if ('A' <= c && c <= 'F') { - v = c - 'A' + 10; - return true; - } else if ('a' <= c && c <= 'f') { - v = c - 'a' + 10; - return true; - } - return false; -} - -inline int from_hex_to_i(const std::string& s, int i, int cnt, int& val) -{ - val = 0; - for (; s[i] && cnt; i++, cnt--) { - int v = 0; - if (is_hex(s[i], v)) { - val = val * 16 + v; - } else { - break; - } - } - return --i; -} - -inline size_t to_utf8(int code, char* buff) -{ - if (code < 0x0080) { - 
buff[0] = (code & 0x7F); - return 1; - } else if (code < 0x0800) { - buff[0] = (0xC0 | ((code >> 6) & 0x1F)); - buff[1] = (0x80 | (code & 0x3F)); - return 2; - } else if (code < 0xD800) { - buff[0] = (0xE0 | ((code >> 12) & 0xF)); - buff[1] = (0x80 | ((code >> 6) & 0x3F)); - buff[2] = (0x80 | (code & 0x3F)); - return 3; - } else if (code < 0xE000) { // D800 - DFFF is invalid... - return 0; - } else if (code < 0x10000) { - buff[0] = (0xE0 | ((code >> 12) & 0xF)); - buff[1] = (0x80 | ((code >> 6) & 0x3F)); - buff[2] = (0x80 | (code & 0x3F)); - return 3; - } else if (code < 0x110000) { - buff[0] = (0xF0 | ((code >> 18) & 0x7)); - buff[1] = (0x80 | ((code >> 12) & 0x3F)); - buff[2] = (0x80 | ((code >> 6) & 0x3F)); - buff[3] = (0x80 | (code & 0x3F)); - return 4; - } - - // NOTREACHED - return 0; -} - -inline std::string decode_url(const std::string& s) -{ - std::string result; - - for (int i = 0; s[i]; i++) { - if (s[i] == '%') { - i++; - assert(s[i]); - - if (s[i] == '%') { - result += s[i]; - } else if (s[i] == 'u') { - // Unicode - i++; - assert(s[i]); - - int val = 0; - i = from_hex_to_i(s, i, 4, val); - - char buff[4]; - size_t len = to_utf8(val, buff); - - if (len > 0) { - result.append(buff, len); + std::cmatch m; + if (std::regex_match(reader.ptr(), m, re)) { + auto key = std::string(m[1]); + auto val = std::string(m[2]); + headers.emplace(key, val); } - } else { - // HEX - int val = 0; - i = from_hex_to_i(s, i, 2, val); - result += val; } - } else if (s[i] == '+') { - result += ' '; - } else { - result += s[i]; + + return true; } + + inline bool read_content_with_length(Stream& strm, std::string& out, size_t len, Progress progress) + { + out.assign(len, 0); + size_t r = 0; + while (r < len){ + auto n = strm.read(&out[r], len - r); + if (n <= 0) { + return false; + } + + r += n; + + if (progress) { + progress(r, len); + } + } + + return true; + } + + inline bool read_content_without_length(Stream& strm, std::string& out) + { + for (;;) { + char byte; + auto n = 
strm.read(&byte, 1); + if (n < 0) { + return false; + } else if (n == 0) { + return true; + } + out += byte; + } + + return true; + } + + inline bool read_content_chunked(Stream& strm, std::string& out) + { + const auto bufsiz = 16; + char buf[bufsiz]; + + stream_line_reader reader(strm, buf, bufsiz); + + if (!reader.getline()) { + return false; + } + + auto chunk_len = std::stoi(reader.ptr(), 0, 16); + + while (chunk_len > 0){ + std::string chunk; + if (!read_content_with_length(strm, chunk, chunk_len, nullptr)) { + return false; + } + + if (!reader.getline()) { + return false; + } + + if (strcmp(reader.ptr(), "\r\n")) { + break; + } + + out += chunk; + + if (!reader.getline()) { + return false; + } + + chunk_len = std::stoi(reader.ptr(), 0, 16); + } + + if (chunk_len == 0) { + // Reader terminator after chunks + if (!reader.getline() || strcmp(reader.ptr(), "\r\n")) + return false; + } + + return true; + } + + template + bool read_content(Stream& strm, T& x, Progress progress = Progress()) + { + auto len = get_header_value_int(x.headers, "Content-Length", 0); + + if (len) { + return read_content_with_length(strm, x.body, len, progress); + } else { + const auto& encoding = get_header_value(x.headers, "Transfer-Encoding", ""); + + if (!strcasecmp(encoding, "chunked")) { + return read_content_chunked(strm, x.body); + } else { + return read_content_without_length(strm, x.body); + } + } + + return true; + } + + template + inline void write_headers(Stream& strm, const T& info) + { + for (const auto& x: info.headers) { + strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str()); + } + strm.write("\r\n"); + } + + inline std::string encode_url(const std::string& s) + { + std::string result; + + for (auto i = 0; s[i]; i++) { + switch (s[i]) { + case ' ': result += "+"; break; + case '\'': result += "%27"; break; + case ',': result += "%2C"; break; + case ':': result += "%3A"; break; + case ';': result += "%3B"; break; + default: + if (s[i] < 0) { + result += '%'; 
+ char hex[4]; + size_t len = snprintf(hex, sizeof(hex) - 1, "%02X", (unsigned char)s[i]); + assert(len == 2); + result.append(hex, len); + } else { + result += s[i]; + } + break; + } + } + + return result; + } + + inline bool is_hex(char c, int& v) + { + if (0x20 <= c && isdigit(c)) { + v = c - '0'; + return true; + } else if ('A' <= c && c <= 'F') { + v = c - 'A' + 10; + return true; + } else if ('a' <= c && c <= 'f') { + v = c - 'a' + 10; + return true; + } + return false; + } + + inline bool from_hex_to_i(const std::string& s, size_t i, size_t cnt, int& val) + { + if (i >= s.size()) { + return false; + } + + val = 0; + for (; cnt; i++, cnt--) { + if (!s[i]) { + return false; + } + int v = 0; + if (is_hex(s[i], v)) { + val = val * 16 + v; + } else { + return false; + } + } + return true; + } + + inline size_t to_utf8(int code, char* buff) + { + if (code < 0x0080) { + buff[0] = (code & 0x7F); + return 1; + } else if (code < 0x0800) { + buff[0] = (0xC0 | ((code >> 6) & 0x1F)); + buff[1] = (0x80 | (code & 0x3F)); + return 2; + } else if (code < 0xD800) { + buff[0] = (0xE0 | ((code >> 12) & 0xF)); + buff[1] = (0x80 | ((code >> 6) & 0x3F)); + buff[2] = (0x80 | (code & 0x3F)); + return 3; + } else if (code < 0xE000) { // D800 - DFFF is invalid... 
+ return 0; + } else if (code < 0x10000) { + buff[0] = (0xE0 | ((code >> 12) & 0xF)); + buff[1] = (0x80 | ((code >> 6) & 0x3F)); + buff[2] = (0x80 | (code & 0x3F)); + return 3; + } else if (code < 0x110000) { + buff[0] = (0xF0 | ((code >> 18) & 0x7)); + buff[1] = (0x80 | ((code >> 12) & 0x3F)); + buff[2] = (0x80 | ((code >> 6) & 0x3F)); + buff[3] = (0x80 | (code & 0x3F)); + return 4; + } + + // NOTREACHED + return 0; + } + + inline std::string decode_url(const std::string& s) + { + std::string result; + + for (size_t i = 0; i < s.size(); i++) { + if (s[i] == '%' && i + 1 < s.size()) { + if (s[i + 1] == 'u') { + int val = 0; + if (from_hex_to_i(s, i + 2, 4, val)) { + // 4 digits Unicode codes + char buff[4]; + size_t len = to_utf8(val, buff); + if (len > 0) { + result.append(buff, len); + } + i += 5; // 'u0000' + } else { + result += s[i]; + } + } else { + int val = 0; + if (from_hex_to_i(s, i + 1, 2, val)) { + // 2 digits hex codes + result += val; + i += 2; // '00' + } else { + result += s[i]; + } + } + } else if (s[i] == '+') { + result += ' '; + } else { + result += s[i]; + } + } + + return result; + } + + inline void parse_query_text(const std::string& s, Params& params) + { + split(&s[0], &s[s.size()], '&', [&](const char* b, const char* e) { + std::string key; + std::string val; + split(b, e, '=', [&](const char* b, const char* e) { + if (key.empty()) { + key.assign(b, e); + } else { + val.assign(b, e); + } + }); + params.emplace(key, decode_url(val)); + }); + } + + inline bool parse_multipart_boundary(const std::string& content_type, std::string& boundary) + { + auto pos = content_type.find("boundary="); + if (pos == std::string::npos) { + return false; + } + + boundary = content_type.substr(pos + 9); + return true; + } + + inline bool parse_multipart_formdata( + const std::string& boundary, const std::string& body, MultipartFiles& files) + { + static std::string dash = "--"; + static std::string crlf = "\r\n"; + + static std::regex re_content_type( + 
"Content-Type: (.*?)", std::regex_constants::icase); + + static std::regex re_content_disposition( + "Content-Disposition: form-data; name=\"(.*?)\"(?:; filename=\"(.*?)\")?", + std::regex_constants::icase); + + auto dash_boundary = dash + boundary; + + auto pos = body.find(dash_boundary); + if (pos != 0) { + return false; + } + + pos += dash_boundary.size(); + + auto next_pos = body.find(crlf, pos); + if (next_pos == std::string::npos) { + return false; + } + + pos = next_pos + crlf.size(); + + while (pos < body.size()) { + next_pos = body.find(crlf, pos); + if (next_pos == std::string::npos) { + return false; + } + + std::string name; + MultipartFile file; + + auto header = body.substr(pos, (next_pos - pos)); + + while (pos != next_pos) { + std::smatch m; + if (std::regex_match(header, m, re_content_type)) { + file.content_type = m[1]; + } else if (std::regex_match(header, m, re_content_disposition)) { + name = m[1]; + file.filename = m[2]; + } + + pos = next_pos + crlf.size(); + + next_pos = body.find(crlf, pos); + if (next_pos == std::string::npos) { + return false; + } + + header = body.substr(pos, (next_pos - pos)); + } + + pos = next_pos + crlf.size(); + + next_pos = body.find(crlf + dash_boundary, pos); + + if (next_pos == std::string::npos) { + return false; + } + + file.offset = pos; + file.length = next_pos - pos; + + pos = next_pos + crlf.size() + dash_boundary.size(); + + next_pos = body.find(crlf, pos); + if (next_pos == std::string::npos) { + return false; + } + + files.emplace(name, file); + + pos = next_pos + crlf.size(); + } + + return true; + } + + inline std::string to_lower(const char* beg, const char* end) + { + std::string out; + auto it = beg; + while (it != end) { + out += ::tolower(*it); + it++; + } + return out; + } + + inline void make_range_header_core(std::string&) {} + + template + inline void make_range_header_core(std::string& field, uint64_t value) + { + if (!field.empty()) { + field += ", "; + } + field += std::to_string(value) + 
"-"; + } + + template + inline void make_range_header_core(std::string& field, uint64_t value1, uint64_t value2, Args... args) + { + if (!field.empty()) { + field += ", "; + } + field += std::to_string(value1) + "-" + std::to_string(value2); + make_range_header_core(field, args...); + } + +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + inline bool can_compress(const std::string& content_type) { + return !content_type.find("text/") || + content_type == "image/svg+xml" || + content_type == "application/javascript" || + content_type == "application/json" || + content_type == "application/xml" || + content_type == "application/xhtml+xml"; +} + +inline void compress(std::string& content) +{ + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; + + auto ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY); + if (ret != Z_OK) { + return; } - return result; + strm.avail_in = content.size(); + strm.next_in = (Bytef *)content.data(); + + std::string compressed; + + const auto bufsiz = 16384; + char buff[bufsiz]; + do { + strm.avail_out = bufsiz; + strm.next_out = (Bytef *)buff; + deflate(&strm, Z_FINISH); + compressed.append(buff, bufsiz - strm.avail_out); + } while (strm.avail_out == 0); + + content.swap(compressed); + + deflateEnd(&strm); } -inline void write_request(Stream& strm, const Request& req) +inline void decompress(std::string& content) { - auto path = encode_url(req.path); - socket_printf(strm, "%s %s HTTP/1.0\r\n", req.method.c_str(), path.c_str()); + z_stream strm; + strm.zalloc = Z_NULL; + strm.zfree = Z_NULL; + strm.opaque = Z_NULL; - write_headers(strm, req); - - if (!req.body.empty()) { - if (req.has_header("application/x-www-form-urlencoded")) { - auto str = encode_url(req.body); - strm.write(str.c_str(), str.size()); - } else { - strm.write(req.body.c_str(), req.body.size()); - } + // 15 is the value of wbits, which should be at the maximum possible value to ensure + // that any gzip stream can be 
decoded. The offset of 16 specifies that the stream + // to decompress will be formatted with a gzip wrapper. + auto ret = inflateInit2(&strm, 16 + 15); + if (ret != Z_OK) { + return; } -} -inline void parse_query_text(const std::string& s, Map& params) -{ - split(&s[0], &s[s.size()], '&', [&](const char* b, const char* e) { - std::string key; - std::string val; - split(b, e, '=', [&](const char* b, const char* e) { - if (key.empty()) { - key.assign(b, e); - } else { - val.assign(b, e); - } - }); - params[key] = detail::decode_url(val); - }); -} + strm.avail_in = content.size(); + strm.next_in = (Bytef *)content.data(); -#ifdef _MSC_VER -class WSInit { + std::string decompressed; + + const auto bufsiz = 16384; + char buff[bufsiz]; + do { + strm.avail_out = bufsiz; + strm.next_out = (Bytef *)buff; + inflate(&strm, Z_NO_FLUSH); + decompressed.append(buff, bufsiz - strm.avail_out); + } while (strm.avail_out == 0); + + content.swap(decompressed); + + inflateEnd(&strm); +} +#endif + +#ifdef _WIN32 + class WSInit { public: WSInit() { WSADATA wsaData; @@ -758,557 +1280,1073 @@ public: static WSInit wsinit_; #endif -} // namespace detail + } // namespace detail + +// Header utilities + template + inline std::pair make_range_header(uint64_t value, Args... 
args) + { + std::string field; + detail::make_range_header_core(field, value, args...); + field.insert(0, "bytes="); + return std::make_pair("Range", field); + } // Request implementation -inline bool Request::has_header(const char* key) const -{ - return headers.find(key) != headers.end(); -} + inline bool Request::has_header(const char* key) const + { + return headers.find(key) != headers.end(); + } -inline std::string Request::get_header_value(const char* key) const -{ - return detail::get_header_value(headers, key, ""); -} + inline std::string Request::get_header_value(const char* key) const + { + return detail::get_header_value(headers, key, ""); + } -inline void Request::set_header(const char* key, const char* val) -{ - headers.insert(std::make_pair(key, val)); -} + inline void Request::set_header(const char* key, const char* val) + { + headers.emplace(key, val); + } -inline bool Request::has_param(const char* key) const -{ - return params.find(key) != params.end(); -} + inline bool Request::has_param(const char* key) const + { + return params.find(key) != params.end(); + } + + inline std::string Request::get_param_value(const char* key) const + { + auto it = params.find(key); + if (it != params.end()) { + return it->second; + } + return std::string(); + } + + inline bool Request::has_file(const char* key) const + { + return files.find(key) != files.end(); + } + + inline MultipartFile Request::get_file_value(const char* key) const + { + auto it = files.find(key); + if (it != files.end()) { + return it->second; + } + return MultipartFile(); + } // Response implementation -inline bool Response::has_header(const char* key) const -{ - return headers.find(key) != headers.end(); -} + inline bool Response::has_header(const char* key) const + { + return headers.find(key) != headers.end(); + } -inline std::string Response::get_header_value(const char* key) const -{ - return detail::get_header_value(headers, key, ""); -} + inline std::string 
Response::get_header_value(const char* key) const + { + return detail::get_header_value(headers, key, ""); + } -inline void Response::set_header(const char* key, const char* val) -{ - headers.insert(std::make_pair(key, val)); -} + inline void Response::set_header(const char* key, const char* val) + { + headers.emplace(key, val); + } -inline void Response::set_redirect(const char* url) -{ - set_header("Location", url); - status = 302; -} + inline void Response::set_redirect(const char* url) + { + set_header("Location", url); + status = 302; + } -inline void Response::set_content(const char* s, size_t n, const char* content_type) -{ - body.assign(s, n); - set_header("Content-Type", content_type); -} + inline void Response::set_content(const char* s, size_t n, const char* content_type) + { + body.assign(s, n); + set_header("Content-Type", content_type); + } -inline void Response::set_content(const std::string& s, const char* content_type) -{ - body = s; - set_header("Content-Type", content_type); -} + inline void Response::set_content(const std::string& s, const char* content_type) + { + body = s; + set_header("Content-Type", content_type); + } + +// Rstream implementation + template + inline void Stream::write_format(const char* fmt, const Args& ...args) + { + const auto bufsiz = 2048; + char buf[bufsiz]; + +#if defined(_MSC_VER) && _MSC_VER < 1900 + auto n = _snprintf_s(buf, bufsiz, bufsiz - 1, fmt, args...); +#else + auto n = snprintf(buf, bufsiz - 1, fmt, args...); +#endif + if (n > 0) { + if (n >= bufsiz - 1) { + std::vector glowable_buf(bufsiz); + + while (n >= static_cast(glowable_buf.size() - 1)) { + glowable_buf.resize(glowable_buf.size() * 2); +#if defined(_MSC_VER) && _MSC_VER < 1900 + n = _snprintf_s(&glowable_buf[0], glowable_buf.size(), glowable_buf.size() - 1, fmt, args...); +#else + n = snprintf(&glowable_buf[0], glowable_buf.size() - 1, fmt, args...); +#endif + } + write(&glowable_buf[0], n); + } else { + write(buf, n); + } + } + } // Socket stream 
implementation -inline SocketStream::SocketStream(socket_t sock): sock_(sock) -{ -} + inline SocketStream::SocketStream(socket_t sock): sock_(sock) + { + } -inline SocketStream::~SocketStream() -{ -} + inline SocketStream::~SocketStream() + { + } -inline int SocketStream::read(char* ptr, size_t size) -{ - return recv(sock_, ptr, size, 0); -} + inline int SocketStream::read(char* ptr, size_t size) + { + return recv(sock_, ptr, size, 0); + } -inline int SocketStream::write(const char* ptr, size_t size) -{ - return send(sock_, ptr, size, 0); -} + inline int SocketStream::write(const char* ptr, size_t size) + { + return send(sock_, ptr, size, 0); + } -inline int SocketStream::write(const char* ptr) -{ - return write(ptr, strlen(ptr)); -} + inline int SocketStream::write(const char* ptr) + { + return write(ptr, strlen(ptr)); + } + + inline std::string SocketStream::get_remote_addr() { + return detail::get_remote_addr(sock_); + } // HTTP server implementation -inline Server::Server() - : svr_sock_(-1) -{ -#if !defined(_MSC_VER) && !defined(WIN32) - signal(SIGPIPE, SIG_IGN); + inline Server::Server() + : keep_alive_max_count_(5) + , is_running_(false) + , svr_sock_(INVALID_SOCKET) + , running_threads_(0) + { +#ifndef _WIN32 + signal(SIGPIPE, SIG_IGN); #endif -} - -inline Server::~Server() -{ -} - -inline void Server::get(const char* pattern, Handler handler) -{ - get_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); -} - -inline void Server::post(const char* pattern, Handler handler) -{ - post_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); -} - -inline bool Server::set_base_dir(const char* path) -{ - if (detail::is_dir(path)) { - base_dir_ = path; - return true; } - return false; -} -inline void Server::set_error_handler(Handler handler) -{ - error_handler_ = handler; -} + inline Server::~Server() + { + } -inline void Server::set_logger(Logger logger) -{ - logger_ = logger; -} + inline Server& Server::Get(const char* pattern, Handler 
handler) + { + get_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); + return *this; + } -inline bool Server::listen(const char* host, int port, int socket_flags) -{ - svr_sock_ = detail::create_server_socket(host, port, socket_flags); - if (svr_sock_ == -1) { + inline Server& Server::Post(const char* pattern, Handler handler) + { + post_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); + return *this; + } + + inline Server& Server::Put(const char* pattern, Handler handler) + { + put_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); + return *this; + } + + inline Server& Server::Delete(const char* pattern, Handler handler) + { + delete_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); + return *this; + } + + inline Server& Server::Options(const char* pattern, Handler handler) + { + options_handlers_.push_back(std::make_pair(std::regex(pattern), handler)); + return *this; + } + + inline bool Server::set_base_dir(const char* path) + { + if (detail::is_dir(path)) { + base_dir_ = path; + return true; + } return false; } - auto ret = true; + inline void Server::set_error_handler(Handler handler) + { + error_handler_ = handler; + } - for (;;) { - socket_t sock = accept(svr_sock_, NULL, NULL); + inline void Server::set_logger(Logger logger) + { + logger_ = logger; + } - if (sock == -1) { - if (svr_sock_ != -1) { - detail::close_socket(svr_sock_); - ret = false; + inline void Server::set_keep_alive_max_count(size_t count) + { + keep_alive_max_count_ = count; + } + + inline int Server::bind_to_any_port(const char* host, int socket_flags) + { + return bind_internal(host, 0, socket_flags); + } + + inline bool Server::listen_after_bind() { + return listen_internal(); + } + + inline bool Server::listen(const char* host, int port, int socket_flags) + { + if (bind_internal(host, port, socket_flags) < 0) + return false; + return listen_internal(); + } + + inline bool Server::is_running() const + { + return 
is_running_; + } + + inline void Server::stop() + { + if (is_running_) { + assert(svr_sock_ != INVALID_SOCKET); + detail::shutdown_socket(svr_sock_); + detail::close_socket(svr_sock_); + svr_sock_ = INVALID_SOCKET; + } + } + + inline bool Server::parse_request_line(const char* s, Request& req) + { + static std::regex re("(GET|HEAD|POST|PUT|DELETE|OPTIONS) (([^?]+)(?:\\?(.+?))?) (HTTP/1\\.[01])\r\n"); + + std::cmatch m; + if (std::regex_match(s, m, re)) { + req.version = std::string(m[4]); + req.method = std::string(m[1]); + req.target = std::string(m[2]); + req.path = detail::decode_url(m[3]); + + // Parse query text + auto len = std::distance(m[4].first, m[4].second); + if (len > 0) { + detail::parse_query_text(m[4], req.params); + } + + return true; + } + + return false; + } + + inline void Server::write_response(Stream& strm, bool last_connection, const Request& req, Response& res) + { + assert(res.status != -1); + + if (400 <= res.status && error_handler_) { + error_handler_(req, res); + } + + // Response line + strm.write_format("HTTP/1.1 %d %s\r\n", + res.status, + detail::status_message(res.status)); + + // Headers + if (last_connection || + req.version == "HTTP/1.0" || + req.get_header_value("Connection") == "close") { + res.set_header("Connection", "close"); + } + + if (!res.body.empty()) { +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + // TODO: 'Accpet-Encoding' has gzip, not gzip;q=0 + const auto& encodings = req.get_header_value("Accept-Encoding"); + if (encodings.find("gzip") != std::string::npos && + detail::can_compress(res.get_header_value("Content-Type"))) { + detail::compress(res.body); + res.set_header("Content-Encoding", "gzip"); + } +#endif + + if (!res.has_header("Content-Type")) { + res.set_header("Content-Type", "text/plain"); + } + + auto length = std::to_string(res.body.size()); + res.set_header("Content-Length", length.c_str()); + } + + detail::write_headers(strm, res); + + // Body + if (!res.body.empty() && req.method != "HEAD") { + 
strm.write(res.body.c_str(), res.body.size()); + } + + // Log + if (logger_) { + logger_(req, res); + } + } + + inline bool Server::handle_file_request(Request& req, Response& res) + { + if (!base_dir_.empty() && detail::is_valid_path(req.path)) { + std::string path = base_dir_ + req.path; + + if (!path.empty() && path.back() == '/') { + path += "index.html"; + } + + if (detail::is_file(path)) { + detail::read_file(path, res.body); + auto type = detail::find_content_type(path); + if (type) { + res.set_header("Content-Type", type); + } + res.status = 200; + return true; + } + } + + return false; + } + + inline socket_t Server::create_server_socket(const char* host, int port, int socket_flags) const + { + return detail::create_socket(host, port, + [](socket_t sock, struct addrinfo& ai) -> bool { + if (::bind(sock, ai.ai_addr, ai.ai_addrlen)) { + return false; + } + if (::listen(sock, 5)) { // Listen through 5 channels + return false; + } + return true; + }, socket_flags); + } + + inline int Server::bind_internal(const char* host, int port, int socket_flags) + { + if (!is_valid()) { + return -1; + } + + svr_sock_ = create_server_socket(host, port, socket_flags); + if (svr_sock_ == INVALID_SOCKET) { + return -1; + } + + if (port == 0) { + struct sockaddr_storage address; + socklen_t len = sizeof(address); + if (getsockname(svr_sock_, reinterpret_cast(&address), &len) == -1) { + return -1; + } + if (address.ss_family == AF_INET) { + return ntohs(reinterpret_cast(&address)->sin_port); + } else if (address.ss_family == AF_INET6) { + return ntohs(reinterpret_cast(&address)->sin6_port); } else { - ; // The server socket was closed by user. 
+ return -1; } - break; + } else { + return port; } - - // TODO: should be async - read_and_close_socket(sock); } - return ret; -} + inline bool Server::listen_internal() + { + auto ret = true; -inline void Server::stop() -{ - detail::shutdown_socket(svr_sock_); - detail::close_socket(svr_sock_); - svr_sock_ = -1; -} + is_running_ = true; -inline bool Server::read_request_line(Stream& strm, Request& req) -{ - const auto BUFSIZ_REQUESTLINE = 2048; - char buf[BUFSIZ_REQUESTLINE]; - if (!detail::socket_gets(strm, buf, BUFSIZ_REQUESTLINE)) { - return false; - } + for (;;) { + auto val = detail::select_read(svr_sock_, 0, 100000); - static std::regex re("(GET|HEAD|POST) ([^?]+)(?:\\?(.+?))? HTTP/1\\.[01]\r\n"); - - std::cmatch m; - if (std::regex_match(buf, m, re)) { - req.method = std::string(m[1]); - req.path = detail::decode_url(m[2]); - - // Parse query text - auto len = std::distance(m[3].first, m[3].second); - if (len > 0) { - detail::parse_query_text(m[3], req.params); - } - - return true; - } - - return false; -} - -inline bool Server::handle_file_request(Request& req, Response& res) -{ - if (!base_dir_.empty()) { - std::string path = base_dir_ + req.path; - - if (!path.empty() && path.back() == '/') { - path += "index.html"; - } - - if (detail::is_file(path)) { - detail::read_file(path, res.body); - auto type = detail::content_type(path); - if (type) { - res.set_header("Content-Type", type); + if (val == 0) { // Timeout + if (svr_sock_ == INVALID_SOCKET) { + // The server socket was closed by 'stop' method. + break; + } + continue; } - res.status = 200; + + socket_t sock = accept(svr_sock_, NULL, NULL); + + if (sock == INVALID_SOCKET) { + if (svr_sock_ != INVALID_SOCKET) { + detail::close_socket(svr_sock_); + ret = false; + } else { + ; // The server socket was closed by user. + } + break; + } + + // TODO: Use thread pool... 
+ std::thread([=]() { + { + std::lock_guard guard(running_threads_mutex_); + running_threads_++; + } + + read_and_close_socket(sock); + + { + std::lock_guard guard(running_threads_mutex_); + running_threads_--; + } + }).detach(); + } + + // TODO: Use thread pool... + for (;;) { + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + std::lock_guard guard(running_threads_mutex_); + if (!running_threads_) { + break; + } + } + + is_running_ = false; + + return ret; + } + + inline bool Server::routing(Request& req, Response& res) + { + if (req.method == "GET" && handle_file_request(req, res)) { return true; } - } - return false; -} - -inline bool Server::routing(Request& req, Response& res) -{ - if (req.method == "GET" && handle_file_request(req, res)) { - return true; - } - - if (req.method == "GET" || req.method == "HEAD") { - return dispatch_request(req, res, get_handlers_); - } else if (req.method == "POST") { - return dispatch_request(req, res, post_handlers_); - } - return false; -} - -inline bool Server::dispatch_request(Request& req, Response& res, Handlers& handlers) -{ - for (const auto& x: handlers) { - const auto& pattern = x.first; - const auto& handler = x.second; - - if (std::regex_match(req.path, req.matches, pattern)) { - handler(req, res); - return true; + if (req.method == "GET" || req.method == "HEAD") { + return dispatch_request(req, res, get_handlers_); + } else if (req.method == "POST") { + return dispatch_request(req, res, post_handlers_); + } else if (req.method == "PUT") { + return dispatch_request(req, res, put_handlers_); + } else if (req.method == "DELETE") { + return dispatch_request(req, res, delete_handlers_); + } else if (req.method == "OPTIONS") { + return dispatch_request(req, res, options_handlers_); } - } - return false; -} - -inline void Server::process_request(Stream& strm) -{ - Request req; - Response res; - - if (!read_request_line(strm, req) || - !detail::read_headers(strm, req.headers)) { - // TODO: - return; - } - - 
if (req.method == "POST") { - if (!detail::read_content(strm, req, false)) { - // TODO: - return; - } - static std::string type = "application/x-www-form-urlencoded"; - if (!req.get_header_value("Content-Type").compare(0, type.size(), type)) { - detail::parse_query_text(req.body, req.params); - } - } - - if (routing(req, res)) { - if (res.status == -1) { - res.status = 200; - } - } else { - res.status = 404; - } - assert(res.status != -1); - - if (400 <= res.status && error_handler_) { - error_handler_(req, res); - } - - detail::write_response(strm, req, res); - - if (logger_) { - logger_(req, res); - } -} - -inline bool Server::read_and_close_socket(socket_t sock) -{ - return detail::read_and_close_socket(sock, [this](Stream& strm) { - process_request(strm); - return true; - }); -} - -// HTTP client implementation -inline Client::Client(const char* host, int port) - : host_(host) - , port_(port) - , host_and_port_(host_ + ":" + std::to_string(port_)) -{ -} - -inline Client::~Client() -{ -} - -inline bool Client::read_response_line(Stream& strm, Response& res) -{ - const auto BUFSIZ_RESPONSELINE = 2048; - char buf[BUFSIZ_RESPONSELINE]; - if (!detail::socket_gets(strm, buf, BUFSIZ_RESPONSELINE)) { return false; } - const static std::regex re("HTTP/1\\.[01] (\\d+?) 
.+\r\n"); + inline bool Server::dispatch_request(Request& req, Response& res, Handlers& handlers) + { + for (const auto& x: handlers) { + const auto& pattern = x.first; + const auto& handler = x.second; - std::cmatch m; - if (std::regex_match(buf, m, re)) { - res.status = std::stoi(std::string(m[1])); - } - - return true; -} - -inline bool Client::send(const Request& req, Response& res) -{ - auto sock = detail::create_client_socket(host_.c_str(), port_); - if (sock == -1) { + if (std::regex_match(req.path, req.matches, pattern)) { + handler(req, res); + return true; + } + } return false; } - return read_and_close_socket(sock, req, res); -} + inline bool Server::process_request(Stream& strm, bool last_connection, bool& connection_close) + { + const auto bufsiz = 2048; + char buf[bufsiz]; -inline bool Client::process_request(Stream& strm, const Request& req, Response& res) -{ - // Send request - detail::write_request(strm, req); + detail::stream_line_reader reader(strm, buf, bufsiz); - // Receive response - if (!read_response_line(strm, res) || - !detail::read_headers(strm, res.headers)) { - return false; - } - if (req.method != "HEAD") { - if (!detail::read_content(strm, res, true)) { + // Connection has been closed on client + if (!reader.getline()) { return false; } - } - return true; -} + Request req; + Response res; -inline bool Client::read_and_close_socket(socket_t sock, const Request& req, Response& res) -{ - return detail::read_and_close_socket(sock, [&](Stream& strm) { - return process_request(strm, req, res); - }); -} + res.version = "HTTP/1.1"; -inline void Client::add_default_headers(Request& req) -{ - req.set_header("Host", host_and_port_.c_str()); - req.set_header("Accept", "*/*"); - req.set_header("User-Agent", "cpp-httplib/0.1"); -} - -inline std::shared_ptr Client::get(const char* path) -{ - Request req; - req.method = "GET"; - req.path = path; - add_default_headers(req); - - auto res = std::make_shared(); - - return send(req, *res) ? 
res : nullptr; -} - -inline std::shared_ptr Client::head(const char* path) -{ - Request req; - req.method = "HEAD"; - req.path = path; - add_default_headers(req); - - auto res = std::make_shared(); - - return send(req, *res) ? res : nullptr; -} - -inline std::shared_ptr Client::post( - const char* path, const std::string& body, const char* content_type) -{ - Request req; - req.method = "POST"; - req.path = path; - add_default_headers(req); - - req.set_header("Content-Type", content_type); - req.body = body; - - auto res = std::make_shared(); - - return send(req, *res) ? res : nullptr; -} - -inline std::shared_ptr Client::post( - const char* path, const Map& params) -{ - std::string query; - for (auto it = params.begin(); it != params.end(); ++it) { - if (it != params.begin()) { - query += "&"; + // Request line and headers + if (!parse_request_line(reader.ptr(), req) || !detail::read_headers(strm, req.headers)) { + res.status = 400; + write_response(strm, last_connection, req, res); + return true; } - query += it->first; - query += "="; - query += it->second; + + auto ret = true; + if (req.get_header_value("Connection") == "close") { + // ret = false; + connection_close = true; + } + + req.set_header("REMOTE_ADDR", strm.get_remote_addr().c_str()); + + // Body + if (req.method == "POST" || req.method == "PUT") { + if (!detail::read_content(strm, req)) { + res.status = 400; + write_response(strm, last_connection, req, res); + return ret; + } + + const auto& content_type = req.get_header_value("Content-Type"); + + if (req.get_header_value("Content-Encoding") == "gzip") { +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + detail::decompress(req.body); +#else + res.status = 415; + write_response(strm, last_connection, req, res); + return ret; +#endif + } + + if (!content_type.find("application/x-www-form-urlencoded")) { + detail::parse_query_text(req.body, req.params); + } else if(!content_type.find("multipart/form-data")) { + std::string boundary; + if 
(!detail::parse_multipart_boundary(content_type, boundary) || + !detail::parse_multipart_formdata(boundary, req.body, req.files)) { + res.status = 400; + write_response(strm, last_connection, req, res); + return ret; + } + } + } + + if (routing(req, res)) { + if (res.status == -1) { + res.status = 200; + } + } else { + res.status = 404; + } + + write_response(strm, last_connection, req, res); + return ret; } - return post(path, query, "application/x-www-form-urlencoded"); -} + inline bool Server::is_valid() const + { + return true; + } + + inline bool Server::read_and_close_socket(socket_t sock) + { + return detail::read_and_close_socket( + sock, + keep_alive_max_count_, + [this](Stream& strm, bool last_connection, bool& connection_close) { + return process_request(strm, last_connection, connection_close); + }); + } + +// HTTP client implementation + inline Client::Client( + const char* host, int port, size_t timeout_sec) + : host_(host) + , port_(port) + , timeout_sec_(timeout_sec) + , host_and_port_(host_ + ":" + std::to_string(port_)) + { + } + + inline Client::~Client() + { + } + + inline bool Client::is_valid() const + { + return true; + } + + inline socket_t Client::create_client_socket() const + { + return detail::create_socket(host_.c_str(), port_, + [=](socket_t sock, struct addrinfo& ai) -> bool { + detail::set_nonblocking(sock, true); + + auto ret = connect(sock, ai.ai_addr, ai.ai_addrlen); + if (ret < 0) { + if (detail::is_connection_error() || + !detail::wait_until_socket_is_ready(sock, timeout_sec_, 0)) { + detail::close_socket(sock); + return false; + } + } + + detail::set_nonblocking(sock, false); + return true; + }); + } + + inline bool Client::read_response_line(Stream& strm, Response& res) + { + const auto bufsiz = 2048; + char buf[bufsiz]; + + detail::stream_line_reader reader(strm, buf, bufsiz); + + if (!reader.getline()) { + return false; + } + + const static std::regex re("(HTTP/1\\.[01]) (\\d+?) 
.+\r\n"); + + std::cmatch m; + if (std::regex_match(reader.ptr(), m, re)) { + res.version = std::string(m[1]); + res.status = std::stoi(std::string(m[2])); + } + + return true; + } + + inline bool Client::send(Request& req, Response& res) + { + if (req.path.empty()) { + return false; + } + + auto sock = create_client_socket(); + if (sock == INVALID_SOCKET) { + return false; + } + + return read_and_close_socket(sock, req, res); + } + + inline void Client::write_request(Stream& strm, Request& req) + { + auto path = detail::encode_url(req.path); + + // Request line + strm.write_format("%s %s HTTP/1.1\r\n", + req.method.c_str(), + path.c_str()); + + // Headers + req.set_header("Host", host_and_port_.c_str()); + + if (!req.has_header("Accept")) { + req.set_header("Accept", "*/*"); + } + + if (!req.has_header("User-Agent")) { + req.set_header("User-Agent", "cpp-httplib/0.2"); + } + + // TODO: Support KeepAlive connection + // if (!req.has_header("Connection")) { + req.set_header("Connection", "close"); + // } + + if (!req.body.empty()) { + if (!req.has_header("Content-Type")) { + req.set_header("Content-Type", "text/plain"); + } + + auto length = std::to_string(req.body.size()); + req.set_header("Content-Length", length.c_str()); + } + + detail::write_headers(strm, req); + + // Body + if (!req.body.empty()) { + if (req.get_header_value("Content-Type") == "application/x-www-form-urlencoded") { + auto str = detail::encode_url(req.body); + strm.write(str.c_str(), str.size()); + } else { + strm.write(req.body.c_str(), req.body.size()); + } + } + } + + inline bool Client::process_request(Stream& strm, Request& req, Response& res, bool& connection_close) + { + // Send request + write_request(strm, req); + + // Receive response and headers + if (!read_response_line(strm, res) || !detail::read_headers(strm, res.headers)) { + return false; + } + + if (res.get_header_value("Connection") == "close" || res.version == "HTTP/1.0") { + connection_close = true; + } + + // Body + if 
(req.method != "HEAD") { + if (!detail::read_content(strm, res, req.progress)) { + return false; + } + + if (res.get_header_value("Content-Encoding") == "gzip") { +#ifdef CPPHTTPLIB_ZLIB_SUPPORT + detail::decompress(res.body); +#else + return false; +#endif + } + } + + return true; + } + + inline bool Client::read_and_close_socket(socket_t sock, Request& req, Response& res) + { + return detail::read_and_close_socket( + sock, + 0, + [&](Stream& strm, bool /*last_connection*/, bool& connection_close) { + return process_request(strm, req, res, connection_close); + }); + } + + inline std::shared_ptr Client::Get(const char* path, Progress progress) + { + return Get(path, Headers(), progress); + } + + inline std::shared_ptr Client::Get(const char* path, const Headers& headers, Progress progress) + { + Request req; + req.method = "GET"; + req.path = path; + req.headers = headers; + req.progress = progress; + + auto res = std::make_shared(); + + return send(req, *res) ? res : nullptr; + } + + inline std::shared_ptr Client::Head(const char* path) + { + return Head(path, Headers()); + } + + inline std::shared_ptr Client::Head(const char* path, const Headers& headers) + { + Request req; + req.method = "HEAD"; + req.headers = headers; + req.path = path; + + auto res = std::make_shared(); + + return send(req, *res) ? res : nullptr; + } + + inline std::shared_ptr Client::Post( + const char* path, const std::string& body, const char* content_type) + { + return Post(path, Headers(), body, content_type); + } + + inline std::shared_ptr Client::Post( + const char* path, const Headers& headers, const std::string& body, const char* content_type) + { + Request req; + req.method = "POST"; + req.headers = headers; + req.path = path; + + req.headers.emplace("Content-Type", content_type); + req.body = body; + + auto res = std::make_shared(); + + return send(req, *res) ? 
res : nullptr; + } + + inline std::shared_ptr Client::Post(const char* path, const Params& params) + { + return Post(path, Headers(), params); + } + + inline std::shared_ptr Client::Post(const char* path, const Headers& headers, const Params& params) + { + std::string query; + for (auto it = params.begin(); it != params.end(); ++it) { + if (it != params.begin()) { + query += "&"; + } + query += it->first; + query += "="; + query += it->second; + } + + return Post(path, headers, query, "application/x-www-form-urlencoded"); + } + + inline std::shared_ptr Client::Put( + const char* path, const std::string& body, const char* content_type) + { + return Put(path, Headers(), body, content_type); + } + + inline std::shared_ptr Client::Put( + const char* path, const Headers& headers, const std::string& body, const char* content_type) + { + Request req; + req.method = "PUT"; + req.headers = headers; + req.path = path; + + req.headers.emplace("Content-Type", content_type); + req.body = body; + + auto res = std::make_shared(); + + return send(req, *res) ? res : nullptr; + } + + inline std::shared_ptr Client::Delete(const char* path) + { + return Delete(path, Headers()); + } + + inline std::shared_ptr Client::Delete(const char* path, const Headers& headers) + { + Request req; + req.method = "DELETE"; + req.path = path; + req.headers = headers; + + auto res = std::make_shared(); + + return send(req, *res) ? res : nullptr; + } + + inline std::shared_ptr Client::Options(const char* path) + { + return Options(path, Headers()); + } + + inline std::shared_ptr Client::Options(const char* path, const Headers& headers) + { + Request req; + req.method = "OPTIONS"; + req.path = path; + req.headers = headers; + + auto res = std::make_shared(); + + return send(req, *res) ? 
res : nullptr; + } /* * SSL Implementation */ #ifdef CPPHTTPLIB_OPENSSL_SUPPORT -namespace detail { + namespace detail { -template -inline bool read_and_close_socket_ssl(socket_t sock, SSL_CTX* ctx, U SSL_connect_or_accept, V setup, T callback) -{ - auto ssl = SSL_new(ctx); + template + inline bool read_and_close_socket_ssl( + socket_t sock, size_t keep_alive_max_count, + // TODO: OpenSSL 1.0.2 occasionally crashes... + // The upcoming 1.1.0 is going to be thread safe. + SSL_CTX* ctx, std::mutex& ctx_mutex, + U SSL_connect_or_accept, V setup, + T callback) + { + SSL* ssl = nullptr; + { + std::lock_guard guard(ctx_mutex); - auto bio = BIO_new_socket(sock, BIO_NOCLOSE); - SSL_set_bio(ssl, bio, bio); + ssl = SSL_new(ctx); + if (!ssl) { + return false; + } + } - setup(ssl); + auto bio = BIO_new_socket(sock, BIO_NOCLOSE); + SSL_set_bio(ssl, bio, bio); - SSL_connect_or_accept(ssl); + setup(ssl); - SSLSocketStream strm(ssl); - auto ret = callback(strm); + SSL_connect_or_accept(ssl); - SSL_shutdown(ssl); - SSL_free(ssl); - close_socket(sock); - return ret; -} + bool ret = false; -class SSLInit { -public: - SSLInit() { - SSL_load_error_strings(); - SSL_library_init(); - } -}; + if (keep_alive_max_count > 0) { + auto count = keep_alive_max_count; + while (count > 0 && + detail::select_read(sock, + CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND, + CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0) { + SSLSocketStream strm(sock, ssl); + auto last_connection = count == 1; + auto connection_close = false; -static SSLInit sslinit_; + ret = callback(strm, last_connection, connection_close); + if (!ret || connection_close) { + break; + } -} // namespace detail + count--; + } + } else { + SSLSocketStream strm(sock, ssl); + auto dummy_connection_close = false; + ret = callback(strm, true, dummy_connection_close); + } + + SSL_shutdown(ssl); + + { + std::lock_guard guard(ctx_mutex); + SSL_free(ssl); + } + + close_socket(sock); + + return ret; + } + + class SSLInit { + public: + SSLInit() { + 
SSL_load_error_strings(); + SSL_library_init(); + } + }; + + static SSLInit sslinit_; + + } // namespace detail // SSL socket stream implementation -inline SSLSocketStream::SSLSocketStream(SSL* ssl): ssl_(ssl) -{ -} + inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL* ssl) + : sock_(sock), ssl_(ssl) + { + } -inline SSLSocketStream::~SSLSocketStream() -{ -} + inline SSLSocketStream::~SSLSocketStream() + { + } -inline int SSLSocketStream::read(char* ptr, size_t size) -{ - return SSL_read(ssl_, ptr, size); -} + inline int SSLSocketStream::read(char* ptr, size_t size) + { + return SSL_read(ssl_, ptr, size); + } -inline int SSLSocketStream::write(const char* ptr, size_t size) -{ - return SSL_write(ssl_, ptr, size); -} + inline int SSLSocketStream::write(const char* ptr, size_t size) + { + return SSL_write(ssl_, ptr, size); + } -inline int SSLSocketStream::write(const char* ptr) -{ - return write(ptr, strlen(ptr)); -} + inline int SSLSocketStream::write(const char* ptr) + { + return write(ptr, strlen(ptr)); + } + + inline std::string SSLSocketStream::get_remote_addr() { + return detail::get_remote_addr(sock_); + } // SSL HTTP server implementation -inline SSLServer::SSLServer(const char* cert_path, const char* private_key_path) -{ - ctx_ = SSL_CTX_new(SSLv23_server_method()); + inline SSLServer::SSLServer(const char* cert_path, const char* private_key_path) + { + ctx_ = SSL_CTX_new(SSLv23_server_method()); - if (ctx_) { - SSL_CTX_set_options(ctx_, - SSL_OP_ALL | SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | - SSL_OP_NO_COMPRESSION | - SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); + if (ctx_) { + SSL_CTX_set_options(ctx_, + SSL_OP_ALL | SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 | + SSL_OP_NO_COMPRESSION | + SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION); - // auto ecdh = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); - // SSL_CTX_set_tmp_ecdh(ctx_, ecdh); - // EC_KEY_free(ecdh); + // auto ecdh = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); + // SSL_CTX_set_tmp_ecdh(ctx_, 
ecdh); + // EC_KEY_free(ecdh); - if (SSL_CTX_use_certificate_file(ctx_, cert_path, SSL_FILETYPE_PEM) != 1 || - SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != 1) { - SSL_CTX_free(ctx_); - ctx_ = nullptr; + if (SSL_CTX_use_certificate_file(ctx_, cert_path, SSL_FILETYPE_PEM) != 1 || + SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != 1) { + SSL_CTX_free(ctx_); + ctx_ = nullptr; + } } } -} -inline SSLServer::~SSLServer() -{ - if (ctx_) { - SSL_CTX_free(ctx_); + inline SSLServer::~SSLServer() + { + if (ctx_) { + SSL_CTX_free(ctx_); + } } -} -inline bool SSLServer::read_and_close_socket(socket_t sock) -{ - return detail::read_and_close_socket_ssl( - sock, ctx_, - SSL_accept, - [](SSL* ssl) {}, - [this](Stream& strm) { - process_request(strm); - return true; - }); -} - -// SSL HTTP client implementation -inline SSLClient::SSLClient(const char* host, int port) - : Client(host, port) -{ - ctx_ = SSL_CTX_new(SSLv23_client_method()); -} - -inline SSLClient::~SSLClient() -{ - if (ctx_) { - SSL_CTX_free(ctx_); + inline bool SSLServer::is_valid() const + { + return ctx_; } -} -inline bool SSLClient::read_and_close_socket(socket_t sock, const Request& req, Response& res) -{ - return detail::read_and_close_socket_ssl( - sock, ctx_, - SSL_connect, - [&](SSL* ssl) { - SSL_set_tlsext_host_name(ssl, host_.c_str()); - }, - [&](Stream& strm) { - return process_request(strm, req, res); - }); -} + inline bool SSLServer::read_and_close_socket(socket_t sock) + { + return detail::read_and_close_socket_ssl( + sock, + keep_alive_max_count_, + ctx_, ctx_mutex_, + SSL_accept, + [](SSL* /*ssl*/) {}, + [this](Stream& strm, bool last_connection, bool& connection_close) { + return process_request(strm, last_connection, connection_close); + }); + } + + // SSL HTTP client implementation + inline SSLClient::SSLClient(const char* host, int port, size_t timeout_sec) + : Client(host, port, timeout_sec) + { + ctx_ = SSL_CTX_new(SSLv23_client_method()); + } + 
+ inline SSLClient::~SSLClient() + { + if (ctx_) { + SSL_CTX_free(ctx_); + } + } + + inline bool SSLClient::is_valid() const + { + return ctx_; + } + + inline bool SSLClient::read_and_close_socket(socket_t sock, Request& req, Response& res) + { + return is_valid() && detail::read_and_close_socket_ssl( + sock, 0, + ctx_, ctx_mutex_, + SSL_connect, + [&](SSL* ssl) { + SSL_set_tlsext_host_name(ssl, host_.c_str()); + }, + [&](Stream& strm, bool /*last_connection*/, bool& connection_close) { + return process_request(strm, req, res, connection_close); + }); + } #endif } // namespace httplib #endif -// vim: et ts=4 sw=4 cin cino={1s ff=unix +// vim: et ts=4 sw=4 cin cino={1s ff=unix \ No newline at end of file diff --git a/src/App.cpp b/src/App.cpp index c004b180..d3330dc2 100644 --- a/src/App.cpp +++ b/src/App.cpp @@ -113,7 +113,6 @@ App::~App() delete m_network; Options::release(); - Mem::release(); Platform::release(); uv_tty_reset_mode(); @@ -142,12 +141,26 @@ int App::start() background(); - if (!CryptoNight::init(m_options->algo(), m_options->aesni())) { - LOG_ERR("\"%s\" hash self-test failed.", m_options->algoName()); - return EINVAL; + if (Options::i()->colors()) { + LOG_INFO(WHITE_BOLD("%s hash self-test"), m_options->algoName()); + } + else { + LOG_INFO("%s hash self-test", m_options->algoName()); } - Mem::allocate(m_options); + if (!CryptoNight::init(m_options->algo(), m_options->aesni())) { + LOG_ERR("%s hash self-test... failed.", m_options->algoName()); + return EINVAL; + } else { + if (Options::i()->colors()) { + LOG_INFO(WHITE_BOLD("%s hash self-test... ") GREEN_BOLD("successful") ".", m_options->algoName()); + } + else { + LOG_INFO("%s hash self-test... 
successful.", m_options->algoName()); + } + } + + Mem::init(m_options); Summary::print(); @@ -174,7 +187,7 @@ int App::start() } # endif - Workers::start(m_options->affinity(), m_options->priority()); + Workers::start(m_options->threads(), m_options->affinity(), m_options->priority()); if (m_options->pools().front()->isValid()) { m_network->connect(); diff --git a/src/App_unix.cpp b/src/App_unix.cpp index df90eb26..8511597c 100644 --- a/src/App_unix.cpp +++ b/src/App_unix.cpp @@ -31,10 +31,6 @@ void App::background() { - if (m_options->affinity() != -1L) { - Cpu::setAffinity(-1, m_options->affinity()); - } - if (m_options->background()) { Log::i()->text(Options::i()->colors() ? "\x1B[01;31m\nBackground mode is not supported by %s on *nix Systems. Please use screen/tmux or systemd service instead.\n" diff --git a/src/App_win.cpp b/src/App_win.cpp index 895f3bdf..6b2716f0 100644 --- a/src/App_win.cpp +++ b/src/App_win.cpp @@ -33,10 +33,6 @@ void App::background() { - if (m_options->affinity() != -1L) { - Cpu::setAffinity(-1, m_options->affinity()); - } - if (!m_options->background()) { return; } diff --git a/src/Cpu.cpp b/src/Cpu.cpp index ddd4642d..73fcdfb4 100644 --- a/src/Cpu.cpp +++ b/src/Cpu.cpp @@ -145,9 +145,9 @@ void Cpu::optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options:: CpuImpl::instance().optimizeParameters(threadsCount, hashFactor, algo, maxCpuUsage, safeMode); } -void Cpu::setAffinity(int id, uint64_t mask) +int Cpu::setThreadAffinity(size_t threadId, int64_t affinityMask) { - CpuImpl::instance().setAffinity(id, mask); + return CpuImpl::instance().setThreadAffinity(threadId, affinityMask); } bool Cpu::hasAES() @@ -194,3 +194,24 @@ size_t Cpu::availableCache() { return CpuImpl::instance().availableCache(); } + +int Cpu::getAssignedCpuId(size_t threadId, int64_t affinityMask) +{ + int cpuId = -1; + + Mem::ThreadBitSet threadAffinityMask = Mem::ThreadBitSet(affinityMask); + size_t threadCount = 0; + + for (size_t i = 0; i < 
CpuImpl::instance().threads(); i++) { + if (threadAffinityMask.test(i)) { + if (threadCount == threadId) { + cpuId = i; + break; + } + + threadCount++; + } + } + + return cpuId; +} diff --git a/src/Cpu.h b/src/Cpu.h index 90bf3e18..a9161d67 100644 --- a/src/Cpu.h +++ b/src/Cpu.h @@ -42,7 +42,7 @@ public: static void optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options::Algo algo, size_t maxCpuUsage, bool safeMode); - static void setAffinity(int id, uint64_t mask); + static int setThreadAffinity(size_t threadId, int64_t affinityMask); static bool hasAES(); static bool isX64(); @@ -53,6 +53,7 @@ public: static size_t sockets(); static size_t threads(); static size_t availableCache(); + static int getAssignedCpuId(size_t threadId, int64_t affinityMask); }; diff --git a/src/CpuImpl.h b/src/CpuImpl.h index 96d3ad4e..b2bec265 100644 --- a/src/CpuImpl.h +++ b/src/CpuImpl.h @@ -29,6 +29,7 @@ #include #include "Options.h" +#include "Mem.h" class CpuImpl { @@ -39,7 +40,7 @@ public: void optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options::Algo algo, size_t maxCpuUsage, bool safeMode); - void setAffinity(int id, uint64_t mask); + int setThreadAffinity(size_t threadId, int64_t affinityMask); bool hasAES(); bool isX64(); diff --git a/src/Cpu_mac.cpp b/src/Cpu_mac.cpp index f82a8924..6bb787e1 100644 --- a/src/Cpu_mac.cpp +++ b/src/Cpu_mac.cpp @@ -22,13 +22,15 @@ */ +#include +#include #include #include #include -#include "Cpu.h" #include "CpuImpl.h" +#include "Cpu.h" void CpuImpl::init() { @@ -39,7 +41,23 @@ void CpuImpl::init() initCommon(); } - -void CpuImpl::setAffinity(int id, uint64_t mask) +int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask) { + int cpuId = -1; + + if (affinityMask != -1L) { + cpuId = Cpu::getAssignedCpuId(threadId, affinityMask); + } else { + cpuId = static_cast(threadId); + } + + if (cpuId > -1) { + thread_port_t mach_thread; + thread_affinity_policy_data_t policy = {static_cast(cpuId)}; + mach_thread = 
pthread_mach_thread_np(pthread_self()); + + thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, (thread_policy_t) & policy, 1); + } + + return cpuId; } diff --git a/src/Cpu_unix.cpp b/src/Cpu_unix.cpp index 191bc38e..ea83f56f 100644 --- a/src/Cpu_unix.cpp +++ b/src/Cpu_unix.cpp @@ -35,9 +35,8 @@ #include #include - #include "CpuImpl.h" - +#include "Cpu.h" #ifdef __FreeBSD__ typedef cpuset_t cpu_set_t; @@ -54,26 +53,31 @@ void CpuImpl::init() } -void CpuImpl::setAffinity(int id, uint64_t mask) +int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask) { - cpu_set_t set; - CPU_ZERO(&set); + int cpuId = -1; - for (size_t i = 0; i < threads(); i++) { - if (mask & (1UL << i)) { - CPU_SET(i, &set); - } - } - - if (id == -1) { -# ifndef __FreeBSD__ - sched_setaffinity(0, sizeof(&set), &set); -# endif + if (affinityMask != -1L) { + cpuId = Cpu::getAssignedCpuId(threadId, affinityMask); } else { -# ifndef __ANDROID__ - pthread_setaffinity_np(pthread_self(), sizeof(&set), &set); -# else - sched_setaffinity(gettid(), sizeof(&set), &set); -# endif + cpuId = static_cast(threadId); } + + if (cpuId > -1) { + cpu_set_t mn; + CPU_ZERO(&mn); + CPU_SET(cpuId, &mn); + +# ifndef __ANDROID__ + if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mn) != 0) { + cpuId = -1; + } +# else + if (sched_setaffinity(gettid(), sizeof(cpu_set_t), &mn) == -1) { + cpuId = -1; + } +# endif + } + + return cpuId; } diff --git a/src/Cpu_win.cpp b/src/Cpu_win.cpp index 1560dd64..2a7e74d2 100644 --- a/src/Cpu_win.cpp +++ b/src/Cpu_win.cpp @@ -28,6 +28,7 @@ #include "CpuImpl.h" #include "Mem.h" +#include "Cpu.h" void CpuImpl::init() { @@ -42,24 +43,29 @@ void CpuImpl::init() } -void CpuImpl::setAffinity(int id, uint64_t mask) +int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask) { - if (id == -1) { - SetProcessAffinityMask(GetCurrentProcess(), mask); + int cpuId = -1; + + if (affinityMask != -1L) { + cpuId = Cpu::getAssignedCpuId(threadId, affinityMask); } else { - 
Mem::ThreadBitSet threadAffinityMask = Mem::ThreadBitSet(mask); - - int threadCount = 0; - - for (size_t i = 0; i < m_totalThreads; i++) { - if (threadAffinityMask.test(i)) { - if (threadCount == id) { - SetThreadAffinityMask(GetCurrentThread(), 1ULL << i); - break; - } - - threadCount++; - } + if (threadId+1 > Cpu::threads()/2) { + cpuId = (threadId - Cpu::threads()/2) + (threadId+1 - Cpu::threads()/2); + } else { + cpuId = threadId * 2; } } + + if (cpuId >= 64) { + cpuId = -1; + } + + if (cpuId > -1) { + if (SetThreadAffinityMask(GetCurrentThread(), 1ULL << cpuId) == 0) { + cpuId = -1; + } + } + + return cpuId; } diff --git a/src/Mem.cpp b/src/Mem.cpp index 522deab4..09cef4f2 100644 --- a/src/Mem.cpp +++ b/src/Mem.cpp @@ -24,25 +24,20 @@ #include - #include "crypto/CryptoNight.h" #include "Mem.h" - -int Mem::m_algo = 0; -int Mem::m_flags = 0; +bool Mem::m_useHugePages = true; size_t Mem::m_hashFactor = 1; -size_t Mem::m_threads = 0; -size_t Mem::m_memorySize = 0; -alignas(16) uint8_t *Mem::m_memory = nullptr; +int Mem::m_flags = 0; +Options::Algo Mem::m_algo = Options::ALGO_CRYPTONIGHT; Mem::ThreadBitSet Mem::m_multiHashThreadMask = Mem::ThreadBitSet(-1L); -cryptonight_ctx *Mem::create(int threadId) +ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId) { size_t scratchPadSize; - switch (m_algo) - { + switch (m_algo) { case Options::ALGO_CRYPTONIGHT_LITE: scratchPadSize = MEMORY_LITE; break; @@ -55,17 +50,29 @@ cryptonight_ctx *Mem::create(int threadId) break; } - size_t offset = 0; - for (int i=0; i < threadId; i++) { - offset += sizeof(cryptonight_ctx); - offset += scratchPadSize * getThreadHashFactor(i); + ScratchPadMem scratchPadMem; + scratchPadMem.realSize = scratchPadSize * getThreadHashFactor(threadId); + scratchPadMem.size = scratchPadSize * getThreadHashFactor(threadId); + scratchPadMem.size += scratchPadMem.size % MEMORY; + scratchPadMem.pages = scratchPadMem.size / MEMORY; + + allocate(scratchPadMem, m_useHugePages); + + for (size_t i = 0; 
i < getThreadHashFactor(threadId); ++i) { + ScratchPad* scratchPad = static_cast(_mm_malloc(sizeof(ScratchPad), 4096)); + scratchPad->memory = scratchPadMem.memory + (i * scratchPadSize); + + scratchPads[i] = scratchPad; } - auto* ctx = reinterpret_cast(&m_memory[offset]); - - size_t memOffset = offset+sizeof(cryptonight_ctx); - - ctx->memory = &m_memory[memOffset]; - - return ctx; + return scratchPadMem; } + +void Mem::release(ScratchPad** scratchPads, ScratchPadMem& scratchPadMem, int threadId) +{ + release(scratchPadMem); + + for (size_t i = 0; i < getThreadHashFactor(threadId); ++i) { + _mm_free(scratchPads[i]); + } +} \ No newline at end of file diff --git a/src/Mem.h b/src/Mem.h index e2048162..032d6662 100644 --- a/src/Mem.h +++ b/src/Mem.h @@ -33,22 +33,47 @@ #include "Options.h" -struct cryptonight_ctx; +#ifdef _WIN32 +# ifdef __GNUC__ +# include +# else +# include +# endif +#else +# if defined(XMRIG_ARM) && !defined(__clang__) +# include "aligned_malloc.h" +# else +# include +# endif +#endif + +struct ScratchPad; + +struct ScratchPadMem +{ + alignas(16) uint8_t *memory; + + size_t hugePages; + size_t pages; + size_t size; + size_t realSize; +}; class Mem { public: typedef std::bitset<128> ThreadBitSet; + enum Flags { HugepagesAvailable = 1, HugepagesEnabled = 2, Lock = 4 }; - static bool allocate(const Options* options); - static cryptonight_ctx *create(int threadId); - static void release(); + static void init(const Options* option); + static ScratchPadMem create(ScratchPad** scratchPads, int threadId); + static void release(ScratchPad** scratchPads, ScratchPadMem& scratchPadMem, int threadId); static inline size_t hashFactor() { return m_hashFactor; } static inline size_t getThreadHashFactor(int threadId) @@ -56,19 +81,19 @@ public: return (m_multiHashThreadMask.all() || m_multiHashThreadMask.test(threadId)) ? 
m_hashFactor : 1; } + static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; } - static inline bool isHugepagesEnabled() { return (m_flags & HugepagesEnabled) != 0; } - static inline int flags() { return m_flags; } - static inline size_t threads() { return m_threads; } private: + static void allocate(ScratchPadMem& scratchPadMem, bool useHugePages); + static void release(ScratchPadMem& scratchPadMem); + +private: + static bool m_useHugePages; static size_t m_hashFactor; - static size_t m_threads; - static int m_algo; static int m_flags; + static Options::Algo m_algo; static ThreadBitSet m_multiHashThreadMask; - static size_t m_memorySize; - alignas(16) static uint8_t *m_memory; }; diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp index b52c449b..48fd56f3 100644 --- a/src/Mem_unix.cpp +++ b/src/Mem_unix.cpp @@ -25,92 +25,61 @@ #include #include - -#if defined(XMRIG_ARM) && !defined(__clang__) -# include "aligned_malloc.h" -#else -# include -#endif - - #include "crypto/CryptoNight.h" #include "log/Log.h" #include "Mem.h" -bool Mem::allocate(const Options* options) +void Mem::init(const Options* options) { - m_algo = options->algo(); - m_threads = options->threads(); m_hashFactor = options->hashFactor(); - m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask()); - m_memorySize = 0; + m_useHugePages = options->hugePages(); + m_algo = options->algo(); + m_multiHashThreadMask = Mem::ThreadBitSet(static_cast(options->multiHashThreadMask())); +} - size_t scratchPadSize; - switch (m_algo) - { - case Options::ALGO_CRYPTONIGHT_LITE: - scratchPadSize = MEMORY_LITE; - break; - case Options::ALGO_CRYPTONIGHT_HEAVY: - scratchPadSize = MEMORY_HEAVY; - break; - case Options::ALGO_CRYPTONIGHT: - default: - scratchPadSize = MEMORY; - break; +void Mem::allocate(ScratchPadMem& scratchPadMem, bool useHugePages) +{ + scratchPadMem.hugePages = 0; + + if (!useHugePages) { + scratchPadMem.memory = static_cast(_mm_malloc(scratchPadMem.size, 
4096)); + return; } - for (size_t i=0; i < m_threads; i++) { - m_memorySize += sizeof(cryptonight_ctx); - m_memorySize += scratchPadSize * getThreadHashFactor(i); - } - - m_memorySize = m_memorySize - (m_memorySize % MEMORY) + MEMORY; - - if (!options->hugePages()) { - m_memory = static_cast(_mm_malloc(m_memorySize, 16)); - return true; - } - - m_flags |= HugepagesAvailable; - # if defined(__APPLE__) - m_memory = static_cast(mmap(0, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0)); + scratchPadMem.memory = static_cast(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0)); # elif defined(__FreeBSD__) - m_memory = static_cast(mmap(0, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0)); + scratchPadMem.memory = static_cast(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0)); # else - m_memory = static_cast(mmap(nullptr, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0)); + scratchPadMem.memory = static_cast(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0)); # endif - if (m_memory == MAP_FAILED) { - m_memory = static_cast(_mm_malloc(m_memorySize, 16)); - return true; + + if (scratchPadMem.memory == MAP_FAILED) { + return allocate(scratchPadMem, false); } - m_flags |= HugepagesEnabled; + scratchPadMem.hugePages = scratchPadMem.pages; - if (madvise(m_memory, m_memorySize, MADV_RANDOM | MADV_WILLNEED) != 0) { + if (madvise(scratchPadMem.memory, scratchPadMem.size, MADV_RANDOM | MADV_WILLNEED) != 0) { LOG_ERR("madvise failed"); } - if (mlock(m_memory, m_memorySize) == 0) { + if (mlock(scratchPadMem.memory, scratchPadMem.size) == 0) { m_flags |= Lock; } - - return true; } - -void Mem::release() +void 
Mem::release(ScratchPadMem &scratchPadMem) { - if (m_flags & HugepagesEnabled) { + if (scratchPadMem.hugePages) { if (m_flags & Lock) { - munlock(m_memory, m_memorySize); + munlock(scratchPadMem.memory, scratchPadMem.size); } - munmap(m_memory, m_memorySize); + munmap(scratchPadMem.memory, scratchPadMem.size); } else { - _mm_free(m_memory); + _mm_free(scratchPadMem.memory); } } diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp index a52cc5b5..d6ee6ba5 100644 --- a/src/Mem_win.cpp +++ b/src/Mem_win.cpp @@ -27,12 +27,6 @@ #include #include -#ifdef __GNUC__ -# include -#else -# include -#endif - #include "log/Log.h" #include "crypto/CryptoNight.h" #include "Mem.h" @@ -144,63 +138,44 @@ static BOOL TrySetLockPagesPrivilege() { } -bool Mem::allocate(const Options* options) +void Mem::init(const Options* options) { - m_algo = options->algo(); - m_threads = options->threads(); m_hashFactor = options->hashFactor(); - m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask()); - m_memorySize = 0; + m_useHugePages = options->hugePages(); + m_algo = options->algo(); + m_multiHashThreadMask = Mem::ThreadBitSet(static_cast(options->multiHashThreadMask())); - size_t scratchPadSize; - switch (m_algo) - { - case Options::ALGO_CRYPTONIGHT_LITE: - scratchPadSize = MEMORY_LITE; - break; - case Options::ALGO_CRYPTONIGHT_HEAVY: - scratchPadSize = MEMORY_HEAVY; - break; - case Options::ALGO_CRYPTONIGHT: - default: - scratchPadSize = MEMORY; - break; + if (m_useHugePages && TrySetLockPagesPrivilege()) { + m_flags |= HugepagesAvailable; } - - for (size_t i=0; i < m_threads; i++) { - m_memorySize += sizeof(cryptonight_ctx); - m_memorySize += scratchPadSize * getThreadHashFactor(i); - } - - m_memorySize = m_memorySize - (m_memorySize % MEMORY) + MEMORY; - - if (!options->hugePages()) { - m_memory = static_cast(_mm_malloc(m_memorySize, 16)); - return true; - } - - if (TrySetLockPagesPrivilege()) { - m_flags |= HugepagesAvailable; - } - - m_memory = static_cast(VirtualAlloc(NULL, 
m_memorySize, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE)); - if (!m_memory) { - m_memory = static_cast(_mm_malloc(m_memorySize, 16)); - } - else { - m_flags |= HugepagesEnabled; - } - - return true; } - -void Mem::release() +void Mem::allocate(ScratchPadMem& scratchPadMem, bool useHugePages) { - if (m_flags & HugepagesEnabled) { - VirtualFree(m_memory, 0, MEM_RELEASE); + scratchPadMem.hugePages = 0; + + if (!useHugePages) { + scratchPadMem.memory = static_cast(_mm_malloc(scratchPadMem.size, 4096)); + return; + } + + scratchPadMem.memory = static_cast(VirtualAlloc(nullptr, scratchPadMem.size, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE)); + if (scratchPadMem.memory) { + scratchPadMem.hugePages = scratchPadMem.pages; + + return; + } + + allocate(scratchPadMem, false); +} + + +void Mem::release(ScratchPadMem &scratchPadMem) +{ + if (scratchPadMem.hugePages) { + VirtualFree(scratchPadMem.memory, 0, MEM_RELEASE); } else { - _mm_free(m_memory); + _mm_free(scratchPadMem.memory); } -} +} \ No newline at end of file diff --git a/src/Options.cpp b/src/Options.cpp index 2d6682ab..11812ddd 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -279,7 +279,7 @@ constexpr static const char *pow_variant_names[] = { "auto", "0", "1", - "ipbc", + "tube", "alloy", "xtl", "msr", @@ -984,7 +984,7 @@ bool Options::setAlgo(const char *algo) if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) { showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")"); m_algo = ALGO_CRYPTONIGHT_LITE; - m_powVariant = POW_IPBC; + m_powVariant = POW_TUBE; break; } @@ -1025,8 +1025,8 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "tube")) { - m_powVariant = POW_IPBC; + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || 
!strcmp(powVariant, "bittube"))) { + m_powVariant = POW_TUBE; break; } diff --git a/src/PowVariant.h b/src/PowVariant.h index 59c8813a..fc20c02a 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -27,7 +27,7 @@ enum PowVariant POW_AUTODETECT, POW_V0, POW_V1, - POW_IPBC, + POW_TUBE, POW_ALLOY, POW_XTL, POW_MSR, @@ -44,8 +44,8 @@ inline std::string getPowVariantName(PowVariant powVariant) return "0"; case POW_V1: return "1"; - case POW_IPBC: - return "ipbc"; + case POW_TUBE: + return "tube"; case POW_ALLOY: return "alloy"; case POW_XTL: @@ -104,8 +104,8 @@ inline PowVariant parseVariant(const std::string variant) powVariant = PowVariant::POW_V0; } else if (variant == "1") { powVariant = PowVariant::POW_V1; - } else if (variant == "ipbc" || variant == "tube") { - powVariant = PowVariant::POW_IPBC; + } else if (variant == "ipbc" || variant == "tube" || variant == "bittube") { + powVariant = PowVariant::POW_TUBE; } else if (variant == "xao" || variant == "alloy") { powVariant = PowVariant::POW_ALLOY; } else if (variant == "xtl" || variant == "stellite") { diff --git a/src/Summary.cpp b/src/Summary.cpp index 0bb6386b..cfad1e14 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -56,18 +56,6 @@ static void print_versions() } -static void print_memory() { - if (Options::i()->colors()) { - Log::i()->text("\x1B[01;32m * \x1B[01;37mHUGE PAGES: %s, %s", - Mem::isHugepagesAvailable() ? "\x1B[01;32mavailable" : "\x1B[01;31munavailable", - Mem::isHugepagesEnabled() ? "\x1B[01;32menabled" : "\x1B[01;31mdisabled"); - } - else { - Log::i()->text(" * HUGE PAGES: %s, %s", Mem::isHugepagesAvailable() ? "available" : "unavailable", Mem::isHugepagesEnabled() ? 
"enabled" : "disabled"); - } -} - - static void print_cpu() { if (Options::i()->colors()) { @@ -125,14 +113,15 @@ static void print_threads() snprintf(affBuf, 32, ", affinity=0x%" PRIX64, Options::i()->affinity()); } else { - affBuf[0] = '\0'; + snprintf(affBuf, 32, ", affinity=auto"); } Log::i()->text(Options::i()->colors() ? - "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, aes=%d, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" : - " * THREADS: %d, %s, aes=%d, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s", + "\x1B[01;32m * \x1B[01;37mTHREADS: \x1B[01;36m%d\x1B[01;37m, %s, %saes=%d\x1B[01;37m, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" : + " * THREADS: %d, %s, %saes=%d, hf=%zu, %sdonate=%d%%%s%s", Options::i()->threads(), Options::i()->algoName(), + Options::i()->colors() && Options::i()->aesni() == 0 ? "\x1B[01;31m" : "", Options::i()->aesni(), Options::i()->hashFactor(), Options::i()->colors() && Options::i()->donateLevel() == 0 ? "\x1B[01;31m" : "", @@ -201,7 +190,6 @@ static void print_commands() void Summary::print() { print_versions(); - print_memory(); print_cpu(); print_threads(); print_pools(); diff --git a/src/api/ApiState.cpp b/src/api/ApiState.cpp index c963a1d6..2e85a1ac 100644 --- a/src/api/ApiState.cpp +++ b/src/api/ApiState.cpp @@ -231,7 +231,7 @@ void ApiState::getMiner(rapidjson::Document &doc) const doc.AddMember("ua", rapidjson::StringRef(Platform::userAgent()), allocator); doc.AddMember("cpu", cpu, allocator); doc.AddMember("algo", rapidjson::StringRef(Options::i()->algoName()), allocator); - doc.AddMember("hugepages", Mem::isHugepagesEnabled(), allocator); + doc.AddMember("hugepages", Mem::isHugepagesAvailable(), allocator); doc.AddMember("donate_level", Options::i()->donateLevel(), allocator); } diff --git a/src/cc/CCClient.cpp b/src/cc/CCClient.cpp index 55550576..4dc15b1b 100644 --- a/src/cc/CCClient.cpp +++ b/src/cc/CCClient.cpp @@ -79,7 +79,6 @@ CCClient::CCClient(Options* options, uv_async_t* async) 
m_clientStatus.setCurrentAlgoName(m_options->algoName()); } - m_clientStatus.setHugepagesEnabled(Mem::isHugepagesEnabled()); m_clientStatus.setHugepages(Mem::isHugepagesAvailable()); m_clientStatus.setHashFactor(Mem::hashFactor()); @@ -265,10 +264,10 @@ std::shared_ptr CCClient::performRequest(const std::string& r # ifndef XMRIG_NO_TLS if (m_self->m_options->ccUseTls()) { - cli = std::make_shared(m_self->m_options->ccHost(), m_self->m_options->ccPort()); + cli = std::make_shared(m_self->m_options->ccHost(), m_self->m_options->ccPort(), 10); } else { # endif - cli = std::make_shared(m_self->m_options->ccHost(), m_self->m_options->ccPort()); + cli = std::make_shared(m_self->m_options->ccHost(), m_self->m_options->ccPort(), 10); # ifndef XMRIG_NO_TLS } # endif diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index e0f90361..bf9b0b08 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -34,88 +34,94 @@ #include "crypto/CryptoNight_test.h" template -static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_ALLOY) { - CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx); + CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_XTL) { - 
CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_MSR) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_RTO) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); }else { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } # endif } template -static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_ALLOY) { - CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, 
output, ctx); + CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_XTL) { - CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_MSR) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_RTO) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, ctx); + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } } template -static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); - } else if (powVersion == PowVariant::POW_IPBC) { - CryptoNightMultiHash<0x40000, 
POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } else if (powVersion == PowVariant::POW_TUBE) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } # endif } template -static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx); - } else if (powVersion == PowVariant::POW_IPBC) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } else if (powVersion == PowVariant::POW_TUBE) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, 
NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } } template -static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_XHV) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); + } + else if (powVersion == PowVariant::POW_TUBE) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad); } # endif } template -static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) { +static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_XHV) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); + } + else if (powVersion == PowVariant::POW_TUBE) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 
0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad); } else { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx); + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad); } } -void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx); +void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); template void setCryptoNightHashMethods(Options::Algo algo, bool aesni) @@ -161,9 +167,9 @@ bool CryptoNight::init(int algo, bool aesni) return selfTest(algo); } -void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx* ctx) +void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { - cryptonight_hash_ctx[factor-1](powVersion, input, size, output, ctx); + cryptonight_hash_ctx[factor-1](powVersion, input, size, output, scratchPad); } bool CryptoNight::selfTest(int algo) @@ -187,8 +193,14 @@ bool CryptoNight::selfTest(int algo) uint8_t output[160]; - auto ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16); - ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16); + ScratchPad* scratchPads [MAX_NUM_HASH_BLOCKS]; + + for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { + ScratchPad* scratchPad = static_cast(_mm_malloc(sizeof(ScratchPad), 4096)); + scratchPad->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16); + + scratchPads[i] = scratchPad; + } bool result = true; bool resultLite = true; @@ -197,188 +209,206 @@ bool CryptoNight::selfTest(int algo) if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) { // cn-heavy - 
cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0; #endif // cn-heavy haven - cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 96) == 0; #endif + + // cn-heavy bittube + + cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 64) == 0; + #endif + + #if 
MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 96) == 0; + #endif } else if (algo == Options::ALGO_CRYPTONIGHT_LITE) { // cn-lite v0 - cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 160) == 0; #endif // cn-lite v7 tests - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, 
scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 160) == 0; #endif // cn-lite ibpc tests - cryptonight_hash_ctx[0](PowVariant::POW_IPBC, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_IPBC, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_IPBC, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_IPBC, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 128) == 0; #endif #if 
MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_IPBC, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0; #endif } else { // cn v0 - cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 160) == 0; #endif // cn v7 - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 64) 
== 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 160) == 0; #endif // cn xtl - cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, ctx); + cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, ctx); + cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, ctx); + cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, ctx); + cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, ctx); + cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && 
memcmp(output, test_output_xtl, 160) == 0; #endif } - _mm_free(ctx->memory); - _mm_free(ctx); + + for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { + _mm_free(scratchPads[i]->memory); + _mm_free(scratchPads[i]); + } return result && resultLite & resultHeavy; } \ No newline at end of file diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 4032802d..753c56fc 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -37,8 +37,8 @@ #define POW_DEFAULT_INDEX_SHIFT 3 #define POW_XLT_V4_INDEX_SHIFT 4 -struct cryptonight_ctx { - alignas(16) uint8_t state[MAX_NUM_HASH_BLOCKS][208]; // 208 instead of 200 to maintain aligned to 16 byte boundaries +struct ScratchPad { + alignas(16) uint8_t state[208]; // 208 instead of 200 to maintain aligned to 16 byte boundaries alignas(16) uint8_t* memory; }; @@ -51,7 +51,7 @@ class CryptoNight public: static bool init(int algo, bool aesni); - static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx* ctx); + static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); private: static bool selfTest(int algo); diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index c823d00f..377c0002 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -31,9 +31,12 @@ #if defined(XMRIG_ARM) && !defined(__clang__) # include "aligned_malloc.h" #else + # include + #endif +#include #include "crypto/CryptoNight.h" #include "crypto/soft_aes.h" @@ -48,27 +51,32 @@ extern "C" #include "crypto/c_skein.h" } -static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { +static inline void do_blake_hash(const uint8_t* input, size_t len, uint8_t* output) +{ blake256_hash(output, input, len); } -static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { +static inline void do_groestl_hash(const 
uint8_t* input, size_t len, uint8_t* output) +{ groestl(input, len * 8, output); } -static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { +static inline void do_jh_hash(const uint8_t* input, size_t len, uint8_t* output) +{ jh_hash(32 * 8, input, 8 * len, output); } -static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { +static inline void do_skein_hash(const uint8_t* input, size_t len, uint8_t* output) +{ xmr_skein(input, output); } -void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +void (* const extra_hashes[4])(const uint8_t*, size_t, uint8_t*) = {do_blake_hash, do_groestl_hash, do_jh_hash, + do_skein_hash}; static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64_t a, const uint64_t b) @@ -76,6 +84,21 @@ static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64 return vcombine_u64(vcreate_u64(b), vcreate_u64(a)); } +#if __ARM_FEATURE_CRYPTO +static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) +{ + alignas(16) const __m128i zero = { 0 }; + return veorq_u8(vaesmcq_u8(vaeseq_u8(v, zero)), rkey ); +} +#else + +static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey) +{ + alignas(16) const __m128i zero = {0}; + return zero; +} + +#endif /* this one was not implemented yet so here it is */ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i a) @@ -87,7 +110,7 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i #define EXTRACT64(X) _mm_cvtsi128_si64(X) -#if defined(XMRIG_ARMv8) +#if defined (__arm64__) || defined (__aarch64__) static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) { unsigned __int128 r = (unsigned __int128) a * (unsigned __int128) b; @@ -95,7 +118,9 @@ static inline uint64_t __umul128(uint64_t a, uint64_t b, 
uint64_t* hi) return (uint64_t) r; } #else -static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) { + +static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) +{ // multiplier = ab = a * 2^32 + b // multiplicand = cd = c * 2^32 + d // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d @@ -119,6 +144,7 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin return product_lo; } + #endif @@ -141,44 +167,48 @@ template static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2) { __m128i xout1 = soft_aeskeygenassist(*xout2); - xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem + xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem *xout0 = sl_xor(*xout0); *xout0 = _mm_xor_si128(*xout0, xout1); - xout1 = soft_aeskeygenassist<0x00>(*xout0); - xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem + xout1 = soft_aeskeygenassist<0x00>(*xout0); + xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem *xout2 = sl_xor(*xout2); *xout2 = _mm_xor_si128(*xout2, xout1); } template -static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) +static inline void +aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, + __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9) { __m128i xout0 = _mm_load_si128(memory); __m128i xout2 = _mm_load_si128(memory + 1); *k0 = xout0; *k1 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x01>(&xout0, &xout2) : soft_aes_genkey_sub<0x01>(&xout0, &xout2); + soft_aes_genkey_sub<0x01>(&xout0, &xout2); *k2 = xout0; *k3 = xout2; - SOFT_AES ? 
soft_aes_genkey_sub<0x02>(&xout0, &xout2) : soft_aes_genkey_sub<0x02>(&xout0, &xout2); + soft_aes_genkey_sub<0x02>(&xout0, &xout2); *k4 = xout0; *k5 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x04>(&xout0, &xout2) : soft_aes_genkey_sub<0x04>(&xout0, &xout2); + soft_aes_genkey_sub<0x04>(&xout0, &xout2); *k6 = xout0; *k7 = xout2; - SOFT_AES ? soft_aes_genkey_sub<0x08>(&xout0, &xout2) : soft_aes_genkey_sub<0x08>(&xout0, &xout2); + soft_aes_genkey_sub<0x08>(&xout0, &xout2); *k8 = xout0; *k9 = xout2; } template -static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7) +static inline void +aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, + __m128i* x7) { if (SOFT_AES) { *x0 = soft_aesenc((uint32_t*)x0, key); @@ -201,11 +231,23 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, *x6 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x6), key)); *x7 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x7), key)); } +# else + else { + *x0 = _mm_aesenc_si128(*x0, key); + *x1 = _mm_aesenc_si128(*x1, key); + *x2 = _mm_aesenc_si128(*x2, key); + *x3 = _mm_aesenc_si128(*x3, key); + *x4 = _mm_aesenc_si128(*x4, key); + *x5 = _mm_aesenc_si128(*x5, key); + *x6 = _mm_aesenc_si128(*x6, key); + *x7 = _mm_aesenc_si128(*x7, key); + } # endif } -inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7) +inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, + __m128i& x7) { __m128i tmp0 = x0; x0 = _mm_xor_si128(x0, x1); @@ -292,6 +334,7 @@ static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* ou xin7 = _mm_load_si128(input + 11); for (size_t i = 0; i < 16; i++) { + if (!SOFT_AES) { aes_round(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); 
} @@ -315,8 +358,7 @@ static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* ou xin5 ^= k9; xin6 ^= k9; xin7 ^= k9; - } - else { + } else { aes_round(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7); } @@ -444,8 +486,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou xout6 = _mm_load_si128(output + 10); xout7 = _mm_load_si128(output + 11); - for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) - { + for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) { xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0); xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1); xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2); @@ -478,8 +519,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou xout5 ^= k9; xout6 ^= k9; xout7 ^= k9; - } - else { + } else { aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } @@ -519,8 +559,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou xout5 ^= k9; xout6 ^= k9; xout7 ^= k9; - } - else { + } else { aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } @@ -551,8 +590,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou xout5 ^= k9; xout6 ^= k9; xout7 ^= k9; - } - else { + } else { aes_round(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7); } @@ -576,8 +614,8 @@ class CryptoNightMultiHash public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -588,12 +626,12 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, - ctx->state[hashBlock], 200); + 
scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -608,12 +646,11 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); -# endif + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], @@ -642,15 +679,15 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - output + hashBlock * 32); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); } } inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -661,14 +698,15 @@ public: uint64_t tweak1_2[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { 
- keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, + 200); tweak1_2[hashBlock] = (*reinterpret_cast(input + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -684,12 +722,11 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); -# endif + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], @@ -698,7 +735,7 @@ public: const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx[hashBlock] = EXTRACT64(cx); bx[hashBlock] = cx; @@ -727,15 +764,15 @@ public: for (size_t hashBlock = 0; hashBlock < 
NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - output + hashBlock * 32); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); } } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -746,14 +783,15 @@ public: uint64_t tweak1_2[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, + 200); tweak1_2[hashBlock] = (*reinterpret_cast(input + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -769,12 +807,11 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); -# ifndef XMRIG_ARMv7 - 
cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); -# endif + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], @@ -783,7 +820,7 @@ public: const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx[hashBlock] = EXTRACT64(cx); bx[hashBlock] = cx; @@ -803,7 +840,8 @@ public: ah[hashBlock] ^= tweak1_2[hashBlock]; - ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*) &l[hashBlock][idx[hashBlock] & + MASK])[0]; ah[hashBlock] ^= ch; al[hashBlock] ^= cl; @@ -814,15 +852,15 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - output + hashBlock * 32); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); } } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -833,12 +871,12 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, 
(int) size, - ctx->state[hashBlock], 200); + scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -853,12 +891,11 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); -# endif + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], @@ -882,11 +919,13 @@ public: al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; - int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = d ^ q; } } @@ -894,15 +933,15 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); 
keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - output + hashBlock * 32); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); } } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -913,12 +952,12 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, - ctx->state[hashBlock], 200); + scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -933,12 +972,11 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } else { - cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]); -# endif + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); } _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], @@ -962,11 +1000,13 @@ public: al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; - int64_t n = 
((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; - int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; - ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = (~d) ^ q; } } @@ -974,8 +1014,129 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, - output + hashBlock * 32); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t tweak1_2[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, + 200); + tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + + hashBlock * size) ^ + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ 
h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + union alignas(16) + { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + + const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + _mm_store_si128((__m128i*) k, key); + cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx = _mm_load_si128((__m128i*) k); + + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = 
((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ah[hashBlock] ^= tweak1_2[hashBlock]; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= tweak1_2[hashBlock]; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*) &l[hashBlock][idx[hashBlock] & + MASK])[0]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; + + idx[hashBlock] = d ^ q; + } + } + +#undef BYTE + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); } } }; @@ -986,8 +1147,8 @@ class CryptoNightMultiHash public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -996,10 +1157,10 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -1012,13 +1173,10 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], 
_mm_set_epi64x(ah, al)); - } - else { - cx = _mm_load_si128((__m128i *) &l[idx & MASK]); - # ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); - # endif + cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); @@ -1043,83 +1201,13 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - } - - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) - { - const uint8_t* l; - uint64_t* h; - uint64_t al; - uint64_t ah; - __m128i bx; - uint64_t idx; - - keccak(static_cast(input), (int) size, ctx->state[0], 200); - - uint64_t tweak1_2 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); - - cn_explode_scratchpad((__m128i*) h, (__m128i*) l); - - al = h[0] ^ h[4]; - ah = h[1] ^ h[5]; - bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); - idx = h[0] ^ h[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx; - - if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); - } - else { - cx = _mm_load_si128((__m128i *) &l[idx & MASK]); - # ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); - # endif - } - - _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); - const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; - static const uint32_t table = 0x75310; - const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - idx = EXTRACT64(cx); - bx = cx; - - uint64_t hi, 
lo, cl, ch; - cl = ((uint64_t*) &l[idx & MASK])[0]; - ch = ((uint64_t*) &l[idx & MASK])[1]; - lo = __umul128(idx, cl, &hi); - - al += hi; - ah += lo; - - ah ^= tweak1_2; - ((uint64_t*) &l[idx & MASK])[0] = al; - ((uint64_t*) &l[idx & MASK])[1] = ah; - ah ^= tweak1_2; - - ah ^= ch; - al ^= cl; - idx = al; + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } - cn_implode_scratchpad((__m128i*) l, (__m128i*) h); - keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - } - - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1128,12 +1216,12 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); uint64_t tweak1_2 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + *(reinterpret_cast(scratchPad[0]->state) + 24)); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -1146,20 +1234,17 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); - } - else { - cx = _mm_load_si128((__m128i *) &l[idx & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); -# endif + cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); const uint8_t tmp = reinterpret_cast(&l[idx & 
MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx = EXTRACT64(cx); bx = cx; @@ -1176,8 +1261,6 @@ public: ((uint64_t*) &l[idx & MASK])[1] = ah; ah ^= tweak1_2; - ((uint64_t*)&l[idx & MASK])[1] ^= ((uint64_t*)&l[idx & MASK])[0]; - ah ^= ch; al ^= cl; idx = al; @@ -1185,13 +1268,82 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + uint64_t tweak1_2 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); + } + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + idx = 
EXTRACT64(cx); + bx = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); + + al += hi; + ah += lo; + + ah ^= tweak1_2; + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + ah ^= tweak1_2; + + ((uint64_t*) &l[idx & MASK])[1] ^= ((uint64_t*) &l[idx & MASK])[0]; + + ah ^= ch; + al ^= cl; + idx = al; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1200,12 +1352,12 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + cn_explode_scratchpad_heavy((__m128i*) scratchPad[0]->state, (__m128i*) scratchPad[0]->memory); - cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); al = h[0] ^ h[4]; ah = h[1] ^ h[5]; @@ -1216,13 +1368,10 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); - } - else { - cx = _mm_load_si128((__m128i *) &l[idx & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); -# endif + cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); @@ -1244,23 +1393,25 @@ public: al ^= cl; idx = al; - 
int64_t n = ((int64_t*)&l[idx & MASK])[0]; - int32_t d = ((int32_t*)&l[idx & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[idx & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[idx & MASK])[0] = n ^ q; - ((int64_t*)&l[idx & MASK])[0] = n ^ q; idx = d ^ q; } - cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); + cn_implode_scratchpad_heavy((__m128i*) scratchPad[0]->memory, (__m128i*) scratchPad[0]->state); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1269,10 +1420,10 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); @@ -1285,13 +1436,10 @@ public: __m128i cx; if (SOFT_AES) { - cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al)); - } - else { - cx = _mm_load_si128((__m128i *) &l[idx & MASK]); -# ifndef XMRIG_ARMv7 - cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al); -# endif + cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al)); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al)); } _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); @@ -1313,17 +1461,122 @@ public: al ^= cl; 
idx = al; - int64_t n = ((int64_t*)&l[idx & MASK])[0]; - int32_t d = ((int32_t*)&l[idx & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[idx & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[idx & MASK])[0] = n ^ q; - ((int64_t*)&l[idx & MASK])[0] = n ^ q; idx = (~d) ^ q; } cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; + + union alignas(16) + { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + + const __m128i& key = _mm_set_epi64x(ah, al); + + _mm_store_si128((__m128i*) k, key); + cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ 
saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx = _mm_load_si128((__m128i*) k); + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx = EXTRACT64(cx); + bx = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); + + al += hi; + ah += lo; + + ah ^= tweak1_2; + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + ah ^= tweak1_2; + + ((uint64_t*) &l[idx & MASK])[1] ^= ((uint64_t*) &l[idx & MASK])[0]; + + ah ^= ch; + al ^= cl; + idx = al; + + const int64x2_t x = vld1q_s64(reinterpret_cast(&l[idx & MASK])); + const int64_t n = vgetq_lane_s64(x, 0); + const int32_t d = vgetq_lane_s32(x, 2); + const int64_t q = n / (d | 0x5); + + ((int64_t*) &l[idx & MASK])[0] = n ^ q; + + idx = d ^ q; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } }; @@ -1333,16 +1586,16 @@ class CryptoNightMultiHash public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); + 
keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1363,16 +1616,14 @@ public: __m128i cx1; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); } else { cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -1420,137 +1671,27 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - 
cryptonight_ctx* __restrict__ ctx) - { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & 
MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - - bx0 = cx0; - bx1 = cx1; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - - keccakf(h0, 24); - keccakf(h1, 24); - - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - } - - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + 
*(reinterpret_cast(scratchPad[1]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1571,16 +1712,14 @@ public: __m128i cx1; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); } else { cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -1589,10 +1728,10 @@ public: static const uint32_t table = 0x75310; uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> 
index) & 0x30); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -1613,7 +1752,115 @@ public: ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; ah0 ^= tweak1_2_0; - ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + 
uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; ah0 ^= ch; al0 ^= cl; @@ -1631,7 +1878,7 @@ public: ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; ah1 ^= tweak1_2_1; - ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; ah1 ^= ch; al1 ^= cl; @@ -1644,22 +1891,22 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - 
extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -1680,16 +1927,14 @@ public: __m128i cx1; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); } else { cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, 
_mm_set_epi64x(ah1, al1)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -1716,12 +1961,15 @@ public: al0 ^= cl; idx0 = al0; - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); + + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + + idx0 = d0 ^ q0; - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; @@ -1737,12 +1985,14 @@ public: al1 ^= cl; idx1 = al1; - n = ((int64_t*)&l1[idx1 & MASK])[0]; - d = ((int32_t*)&l1[idx1 & MASK])[2]; - q = n / (d | 0x5); + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = d ^ q; + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + + idx1 = d1 ^ q1; } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -1751,22 +2001,22 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + 
keccak(input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -1787,16 +2037,14 @@ public: __m128i cx1; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); } else { cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -1823,12 +2071,14 @@ public: al0 ^= cl; idx0 = al0; - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + + idx0 = (~d0) ^ q0; cl = ((uint64_t*) &l1[idx1 & 
MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; @@ -1844,12 +2094,14 @@ public: al1 ^= cl; idx1 = al1; - n = ((int64_t*)&l1[idx1 & MASK])[0]; - d = ((int32_t*)&l1[idx1 & MASK])[2]; - q = n / (d | 0x5); + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = (~d) ^ q; + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + + idx1 = (~d1) ^ q1; } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -1858,8 +2110,178 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + 
+ __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + union alignas(16) + { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + const __m128i& key0 = _mm_set_epi64x(ah0, al0); + + _mm_store_si128((__m128i*) k, key0); + cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx0); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx0 = _mm_load_si128((__m128i*) k); + + const __m128i& key1 = _mm_set_epi64x(ah1, al1); + + _mm_store_si128((__m128i*) k, key1); + cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx1); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 
3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx1 = _mm_load_si128((__m128i*) k); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); + + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + + idx0 = d0 ^ q0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + 
const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); + + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + + idx1 = d1 ^ q1; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } }; @@ -1869,19 +2291,19 @@ class CryptoNightMultiHash public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1902,25 +2324,23 @@ public: uint64_t idx1 = h1[0] ^h1[4]; uint64_t idx2 = h2[0] ^h2[4]; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i 
cx0; - __m128i cx1; - __m128i cx2; + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } - else { - cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); -# endif + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -1990,185 +2410,33 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + 
extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) - { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } - else { - cx0 
= _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); -# endif - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - 
((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - } - - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); - const uint8_t* 
l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2195,19 +2463,17 @@ public: __m128i cx2; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } - else { - cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); -# endif + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = 
_mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -2217,13 +2483,13 @@ public: static const uint32_t table = 0x75310; uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -2247,8 +2513,6 @@ public: ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; ah0 ^= tweak1_2_0; - ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; - ah0 ^= ch; al0 ^= cl; idx0 = al0; @@ -2266,8 +2530,6 @@ public: ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; ah1 ^= tweak1_2_1; - ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; - ah1 ^= ch; al1 ^= cl; idx1 = al1; @@ -2285,8 +2547,6 @@ public: ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; ah2 ^= tweak1_2_2; - ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0]; - ah2 ^= ch; al2 ^= cl; idx2 = al2; @@ -2300,26 +2560,182 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 
3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) 
&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + 
((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0]; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = 
scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -2346,19 +2762,17 @@ public: __m128i cx2; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } - else { - cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); -# endif + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -2389,13 +2803,14 @@ public: al0 ^= cl; idx0 = al0; - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; 
- int64_t q = n / (d | 0x5); + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = d ^ q; + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + idx0 = d0 ^ q0; cl = ((uint64_t*) &l1[idx1 & MASK])[0]; ch = ((uint64_t*) &l1[idx1 & MASK])[1]; @@ -2411,12 +2826,14 @@ public: al1 ^= cl; idx1 = al1; - n = ((int64_t*)&l1[idx1 & MASK])[0]; - d = ((int32_t*)&l1[idx1 & MASK])[2]; - q = n / (d | 0x5); + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = d ^ q; + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + + idx1 = d1 ^ q1; cl = ((uint64_t*) &l2[idx2 & MASK])[0]; @@ -2433,12 +2850,15 @@ public: al2 ^= cl; idx2 = al2; - n = ((int64_t*)&l2[idx2 & MASK])[0]; - d = ((int32_t*)&l2[idx2 & MASK])[2]; - q = n / (d | 0x5); - ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; - idx2 = d ^ q; + const int64x2_t x2 = vld1q_s64(reinterpret_cast(&l2[idx2 & MASK])); + const int64_t n2 = vgetq_lane_s64(x2, 0); + const int32_t d2 = vgetq_lane_s32(x2, 2); + const int64_t q2 = n2 / (d2 | 0x5); + + ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2; + + idx2 = d2 ^ q2; } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -2449,26 +2869,26 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 
64); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -2495,19 +2915,17 @@ public: __m128i cx2; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } - else { - cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]); -# ifndef XMRIG_ARMv7 - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = 
vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); -# endif + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -2538,12 +2956,14 @@ public: al0 ^= cl; idx0 = al0; - int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; - int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; - int64_t q = n / (d | 0x5); + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); - ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; - idx0 = (~d) ^ q; + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + + idx0 = (~d0) ^ q0; cl = ((uint64_t*) &l1[idx1 & MASK])[0]; @@ -2560,13 +2980,14 @@ public: al1 ^= cl; idx1 = al1; - n = ((int64_t*)&l1[idx1 & MASK])[0]; - d = ((int32_t*)&l1[idx1 & MASK])[2]; - q = n / (d | 0x5); + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); - ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; - idx1 = (~d) ^ q; + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + idx1 = (~d1) ^ q1; cl = ((uint64_t*) &l2[idx2 & MASK])[0]; ch = ((uint64_t*) &l2[idx2 & MASK])[1]; @@ -2582,12 +3003,14 @@ public: al2 ^= cl; idx2 = al2; - n = ((int64_t*)&l2[idx2 & MASK])[0]; - d = ((int32_t*)&l2[idx2 & MASK])[2]; - q = n / (d 
| 0x5); + const int64x2_t x2 = vld1q_s64(reinterpret_cast(&l2[idx2 & MASK])); + const int64_t n2 = vgetq_lane_s64(x2, 0); + const int32_t d2 = vgetq_lane_s32(x2, 2); + const int64_t q2 = n2 / (d2 | 0x5); - ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; - idx2 = (~d) ^ q; + ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2; + + idx2 = (~d2) ^ q2; } cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); @@ -2598,9 +3021,246 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = 
reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + union alignas(16) + { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + __m128i cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + const __m128i& key0 = _mm_set_epi64x(ah0, al0); + + _mm_store_si128((__m128i*) k, key0); + cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx0); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx0 = _mm_load_si128((__m128i*) k); + + const __m128i& key1 = _mm_set_epi64x(ah1, al1); + + _mm_store_si128((__m128i*) 
k, key1); + cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx1); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx1 = _mm_load_si128((__m128i*) k); + + const __m128i& key2 = _mm_set_epi64x(ah2, al2); + + _mm_store_si128((__m128i*) k, key2); + cx2 = _mm_xor_si128(cx2, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*) x, cx2); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; + + cx2 = _mm_load_si128((__m128i*) k); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + 
((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + const int64x2_t x0 = vld1q_s64(reinterpret_cast(&l0[idx0 & MASK])); + const int64_t n0 = vgetq_lane_s64(x0, 0); + const int32_t d0 = vgetq_lane_s32(x0, 2); + const int64_t q0 = n0 / (d0 | 0x5); + + ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0; + + idx0 = d0 ^ q0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + const int64x2_t x1 = vld1q_s64(reinterpret_cast(&l1[idx1 & MASK])); + const int64_t n1 = vgetq_lane_s64(x1, 0); + const int32_t d1 = vgetq_lane_s32(x1, 2); + const int64_t q1 = n1 / (d1 | 0x5); + + ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1; + + idx1 = d1 ^ q1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = 
__umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0]; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + const int64x2_t x2 = vld1q_s64(reinterpret_cast(&l2[idx2 & MASK])); + const int64_t n2 = vgetq_lane_s64(x2, 0); + const int32_t d2 = vgetq_lane_s32(x2, 2); + const int64_t q2 = n2 / (d2 | 0x5); + + ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2; + + idx2 = d2 ^ q2; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } }; @@ -2610,22 +3270,22 @@ class CryptoNightMultiHash public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * 
MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2658,22 +3318,20 @@ public: __m128i cx3; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); } else { -# ifndef XMRIG_ARMv7 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, 
vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -2763,231 +3421,39 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) - { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = 
ctx->memory + 3 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - } else { -# ifndef XMRIG_ARMv7 - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = 
vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); -# endif - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - 
((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - - - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - - al3 += hi; - ah3 += lo; - - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; - - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); - - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - } - - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, 
(int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3020,22 +3486,20 @@ public: __m128i cx3; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx0 = 
soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); } else { -# ifndef XMRIG_ARMv7 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -3046,16 +3510,16 @@ public: static const uint32_t table = 0x75310; uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 
0x30); + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -3081,8 +3545,6 @@ public: ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; ah0 ^= tweak1_2_0; - ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; - ah0 ^= ch; al0 ^= cl; idx0 = al0; @@ -3100,8 +3562,6 @@ public: ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; ah1 ^= tweak1_2_1; - ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; - ah1 ^= ch; al1 ^= cl; idx1 = al1; @@ -3119,7 +3579,201 @@ public: ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; ah2 ^= tweak1_2_2; - ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0]; + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + inline static void hashLiteTube(const uint8_t* 
__restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + 
uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) 
(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0]; ah2 ^= ch; al2 ^= cl; @@ -3138,7 +3792,7 @@ public: ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; ah3 ^= tweak1_2_3; - ((uint64_t*)&l3[idx3 & MASK])[1] ^= ((uint64_t*)&l3[idx3 & MASK])[0]; + ((uint64_t*) &l3[idx3 & MASK])[1] ^= ((uint64_t*) &l3[idx3 & MASK])[0]; ah3 ^= ch; al3 ^= cl; @@ -3155,24 +3809,32 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 
200, output + 96); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + // not supported + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } @@ -3180,29 +3842,29 @@ public: template class CryptoNightMultiHash -{ +{// public: inline static void hash(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); - keccak(input + 4 * size, (int) size, ctx->state[4], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200); - const uint8_t* l0 = ctx->memory; - 
const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3241,25 +3903,23 @@ public: __m128i cx4; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); } else { -# ifndef XMRIG_ARMv7 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); 
cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); - cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -3368,278 +4028,46 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t *__restrict__ output, - 
cryptonight_ctx* __restrict__ ctx) - { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); - keccak(input + 4 * size, (int) size, ctx->state[4], 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); - uint64_t tweak1_2_4 = (*reinterpret_cast(input + 35 + 4 * size) ^ - *(reinterpret_cast(ctx->state[4]) + 24)); - - - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t al4 = h4[0] ^h4[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - uint64_t ah4 = h4[1] ^h4[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i 
bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - uint64_t idx4 = h4[0] ^h4[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - __m128i cx4; - - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); - } else { -# ifndef XMRIG_ARMv7 - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); - - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); - cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4); -# endif - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) 
&l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - idx4 = EXTRACT64(cx4); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - bx4 = cx4; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & 
MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - - - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - - al3 += hi; - ah3 += lo; - - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; - - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; - - - cl = ((uint64_t*) &l4[idx4 & MASK])[0]; - ch = ((uint64_t*) &l4[idx4 & MASK])[1]; - lo = __umul128(idx4, cl, &hi); - - al4 += hi; - ah4 += lo; - - ah4 ^= tweak1_2_4; - ((uint64_t*) &l4[idx4 & MASK])[0] = al4; - ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; - ah4 ^= tweak1_2_4; - - ah4 ^= ch; - al4 ^= cl; - idx4 = al4; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); - - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); - keccakf(h4, 24); - - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); - } - - inline static void hashLiteIpbc (const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { - keccak(input, (int) size, ctx->state[0], 200); - keccak(input + size, (int) size, 
ctx->state[1], 200); - keccak(input + 2 * size, (int) size, ctx->state[2], 200); - keccak(input + 3 * size, (int) size, ctx->state[3], 200); - keccak(input + 4 * size, (int) size, ctx->state[4], 200); + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); uint64_t tweak1_2_4 = (*reinterpret_cast(input + 35 + 4 * size) ^ - *(reinterpret_cast(ctx->state[4]) + 24)); + *(reinterpret_cast(scratchPad[4]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + 
uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3678,25 +4106,23 @@ public: __m128i cx4; if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); } else { -# ifndef XMRIG_ARMv7 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); - cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0); - cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1); - cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2); - cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3); - cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ 
_mm_set_epi64x(ah4, al4); -# endif + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); } _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); @@ -3708,19 +4134,19 @@ public: static const uint32_t table = 0x75310; uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t*) (&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); idx0 = EXTRACT64(cx0); idx1 = EXTRACT64(cx1); @@ -3747,8 +4173,6 @@ public: ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; ah0 ^= tweak1_2_0; - ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; - ah0 ^= ch; al0 ^= cl; idx0 = al0; @@ -3766,8 +4190,6 @@ public: ((uint64_t*) 
&l1[idx1 & MASK])[1] = ah1; ah1 ^= tweak1_2_1; - ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; - ah1 ^= ch; al1 ^= cl; idx1 = al1; @@ -3785,8 +4207,6 @@ public: ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; ah2 ^= tweak1_2_2; - ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0]; - ah2 ^= ch; al2 ^= cl; idx2 = al2; @@ -3804,8 +4224,6 @@ public: ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; ah3 ^= tweak1_2_3; - ((uint64_t*)&l3[idx3 & MASK])[1] ^= ((uint64_t*)&l3[idx3 & MASK])[0]; - ah3 ^= ch; al3 ^= cl; idx3 = al3; @@ -3823,7 +4241,245 @@ public: ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; ah4 ^= tweak1_2_4; - ((uint64_t*)&l4[idx4 & MASK])[1] ^= ((uint64_t*)&l4[idx4 & MASK])[0]; + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * 
size, (int) size, scratchPad[4]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(input + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(input + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(input + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(input + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + uint64_t tweak1_2_4 = (*reinterpret_cast(input + 35 + 4 * size) ^ + *(reinterpret_cast(scratchPad[4]->state) + 24)); + + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx4 = 
_mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 
1)) << 1; + ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*) (&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + bx4 = cx4; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0]; + + ah2 ^= ch; 
+ al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ((uint64_t*) &l3[idx3 & MASK])[1] ^= ((uint64_t*) &l3[idx3 & MASK])[0]; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + lo = __umul128(idx4, cl, &hi); + + al4 += hi; + ah4 += lo; + + ah4 ^= tweak1_2_4; + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + ah4 ^= tweak1_2_4; + + ((uint64_t*) &l4[idx4 & MASK])[1] ^= ((uint64_t*) &l4[idx4 & MASK])[0]; ah4 ^= ch; al4 ^= cl; @@ -3842,25 +4498,33 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } inline static void hashHeavy(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, size_t size, - uint8_t *__restrict__ output, - 
cryptonight_ctx* __restrict__ ctx) + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + // not supported + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 7a6f9cc3..0aee57b3 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -158,7 +158,7 @@ const static uint8_t test_output_heavy[160] = { }; // CN-Heavy Haven -const static uint8_t test_output_heavy_haven[160] = { +const static uint8_t test_output_heavy_haven[96] = { 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, 0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F, @@ -167,4 +167,14 @@ const static uint8_t test_output_heavy_haven[160] = { 0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B }; +// CN-Heavy Tube +const static uint8_t test_output_heavy_tube[96] = { + 0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf, + 0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35, + 0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3, + 0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74, + 0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f, + 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb +}; + #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index bfb8a122..6ac2098d 100644 --- a/src/crypto/CryptoNight_x86.h +++ 
b/src/crypto/CryptoNight_x86.h @@ -475,7 +475,7 @@ public: inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -485,19 +485,18 @@ public: uint64_t idx[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; } @@ -538,7 +537,7 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, output + hashBlock * 32); } } @@ -546,7 +545,7 @@ public: inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; 
uint64_t* h[NUM_HASH_BLOCKS]; @@ -557,14 +556,14 @@ public: uint64_t tweak1_2[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -619,15 +618,15 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, output + hashBlock * 32); } } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -638,14 +637,14 @@ public: uint64_t tweak1_2[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, 
scratchPad[hashBlock]->state, 200); tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ - *(reinterpret_cast(ctx->state[hashBlock]) + 24)); + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -704,7 +703,7 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, output + hashBlock * 32); } } @@ -712,7 +711,7 @@ public: inline static void hashHeavy(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -722,12 +721,12 @@ public: uint64_t idx[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -781,7 +780,7 @@ public: for (size_t 
hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, output + hashBlock * 32); } } @@ -789,7 +788,7 @@ public: inline static void hashHeavyHaven(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l[NUM_HASH_BLOCKS]; uint64_t* h[NUM_HASH_BLOCKS]; @@ -799,12 +798,12 @@ public: uint64_t idx[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - keccak(static_cast(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200); + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); } for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - l[hashBlock] = ctx->memory + hashBlock * MEM; - h[hashBlock] = reinterpret_cast(ctx->state[hashBlock]); + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); @@ -858,12 +857,122 @@ public: for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); - extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200, + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, output + hashBlock * 32); } } -}; + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + 
uint64_t ah[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t tweak1_2[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); + tweak1_2[hashBlock] = (*reinterpret_cast(reinterpret_cast(input) + 35 + hashBlock * size) ^ + *(reinterpret_cast(scratchPad[hashBlock]->state) + 24)); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad_heavy((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + } + + union alignas(16) { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + __m128i cx; + + cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + + const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + _mm_store_si128((__m128i*)k, key); + cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ 
saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx = _mm_load_si128((__m128i*)k); + + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx[hashBlock] = EXTRACT64(cx); + bx[hashBlock] = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + lo = __umul128(idx[hashBlock], cl, &hi); + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ah[hashBlock] ^= tweak1_2[hashBlock]; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= tweak1_2[hashBlock]; + + ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + int64_t n = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0]; + int32_t d = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; + idx[hashBlock] = d ^ q; + } + } + +#undef BYTE + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } +}; template class CryptoNightMultiHash @@ -872,7 +981,7 @@ public: 
inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -881,10 +990,10 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -925,13 +1034,13 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -940,12 +1049,12 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + *(reinterpret_cast(scratchPad[0]->state) + 24)); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -969,6 +1078,7 @@ public: static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + idx = EXTRACT64(cx); bx = cx; @@ -992,13 +1102,13 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + 
extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1007,12 +1117,12 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + *(reinterpret_cast(scratchPad[0]->state) + 24)); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad((__m128i*) h, (__m128i*) l); @@ -1061,13 +1171,13 @@ public: cn_implode_scratchpad((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } inline static void hashHeavy(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1076,10 +1186,10 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); @@ -1127,13 +1237,13 @@ public: cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, 
output); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { const uint8_t* l; uint64_t* h; @@ -1142,10 +1252,10 @@ public: __m128i bx; uint64_t idx; - keccak(static_cast(input), (int) size, ctx->state[0], 200); + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); - l = ctx->memory; - h = reinterpret_cast(ctx->state[0]); + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); @@ -1193,7 +1303,102 @@ public: cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); keccakf(h, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + __m128i bx; + uint64_t idx; + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + uint64_t tweak1_2 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad_heavy((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + idx = h[0] ^ h[4]; + + union alignas(16) { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + + const __m128i& key = _mm_set_epi64x(ah, al); + + _mm_store_si128((__m128i*)k, key); + cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + 
_mm_store_si128((__m128i*)x, cx); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx = _mm_load_si128((__m128i*)k); + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx)); + const uint8_t tmp = reinterpret_cast(&l[idx & MASK])[11]; + static const uint32_t table = 0x75310; + const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx = EXTRACT64(cx); + bx = cx; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + lo = __umul128(idx, cl, &hi); + + al += hi; + ah += lo; + + ah ^= tweak1_2; + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + ah ^= tweak1_2; + + ((uint64_t*)&l[idx & MASK])[1] ^= ((uint64_t*)&l[idx & MASK])[0]; + + ah ^= ch; + al ^= cl; + idx = al; + + int64_t n = ((int64_t*)&l[idx & MASK])[0]; + int32_t d = ((int32_t*)&l[idx & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l[idx & MASK])[0] = n ^ q; + idx = d ^ q; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } }; @@ -1204,15 +1409,15 @@ public: inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) 
input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1288,27 +1493,27 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = 
ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1396,27 +1601,27 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = 
reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1508,22 +1713,22 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } inline static void hashHeavy(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -1614,22 +1819,22 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, size_t 
size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -1720,8 +1925,164 @@ public: keccakf(h0, 24); keccakf(h1, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = 
reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + union alignas(16) { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + const __m128i& key0 = _mm_set_epi64x(ah0, al0); + + _mm_store_si128((__m128i*)k, key0); + cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx0); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx0 = _mm_load_si128((__m128i*)k); + + const __m128i& key1 = _mm_set_epi64x(ah1, al1); + + _mm_store_si128((__m128i*)k, key1); + cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx1); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 
2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx1 = _mm_load_si128((__m128i*)k); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + bx0 = cx0; + bx1 = cx1; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) 
&l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } }; @@ -1732,18 +2093,18 @@ public: inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); 
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -1850,33 +2211,33 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = 
reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2000,33 +2361,33 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + 
*(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2156,26 +2517,26 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } inline static void hashHeavy(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + 
keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -2303,26 +2664,26 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* 
l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); @@ -2450,9 +2811,227 @@ public: keccakf(h1, 24); keccakf(h2, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + + const uint8_t* l0 = 
scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad_heavy((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad_heavy((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad_heavy((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + union alignas(16) { + uint32_t k[4]; + uint64_t v64[2]; + }; + alignas(16) uint32_t x[4]; + +#define BYTE(p, i) ((unsigned char*)&p)[i] + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + __m128i cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + const __m128i& key0 = _mm_set_epi64x(ah0, al0); + + _mm_store_si128((__m128i*)k, key0); + cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx0); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= 
saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx0 = _mm_load_si128((__m128i*)k); + + const __m128i& key1 = _mm_set_epi64x(ah1, al1); + + _mm_store_si128((__m128i*)k, key1); + cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx1); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx1 = _mm_load_si128((__m128i*)k); + + const __m128i& key2 = _mm_set_epi64x(ah2, al2); + + _mm_store_si128((__m128i*)k, key2); + cx2 = _mm_xor_si128(cx2, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i*)x, cx2); + + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + x[0] ^= k[0]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + x[1] ^= k[1]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + x[2] ^= k[2]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + + cx2 = _mm_load_si128((__m128i*)k); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + 
_mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0]; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + int64_t n = ((int64_t*)&l0[idx0 & MASK])[0]; + int32_t d = ((int32_t*)&l0[idx0 & MASK])[2]; + int64_t q = n / (d | 0x5); + + ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q; + idx0 = d ^ q; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0]; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + n = ((int64_t*)&l1[idx1 & MASK])[0]; + d = ((int32_t*)&l1[idx1 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q; + idx1 = d ^ q; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = 
((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0]; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + n = ((int64_t*)&l2[idx2 & MASK])[0]; + d = ((int32_t*)&l2[idx2 & MASK])[2]; + q = n / (d | 0x5); + + ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q; + idx2 = d ^ q; + } +#undef BYTE + + cn_implode_scratchpad_heavy((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad_heavy((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad_heavy((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } }; @@ -2463,21 +3042,21 @@ public: inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = 
ctx->memory + 3 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2613,39 +3192,39 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) 
input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -2803,39 +3382,39 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output 
+ 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - 
*(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3001,24 +3580,32 @@ public: keccakf(h2, 24); keccakf(h3, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - 
uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + // not supported + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } @@ -3031,24 +3618,24 @@ public: inline static void hash(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); - keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const 
uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3212,46 +3799,46 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); - keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const 
uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ - *(reinterpret_cast(ctx->state[4]) + 24)); + *(reinterpret_cast(scratchPad[4]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = 
reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3442,46 +4029,46 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } - inline static void hashLiteIpbc(const uint8_t* __restrict__ input, + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + ScratchPad** __restrict__ scratchPad) { - keccak((const uint8_t*) input, (int) size, ctx->state[0], 200); - keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200); - keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200); - keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200); - keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200); + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + 
keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(ctx->state[0]) + 24)); + *(reinterpret_cast(scratchPad[0]->state) + 24)); uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(ctx->state[1]) + 24)); + *(reinterpret_cast(scratchPad[1]->state) + 24)); uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(ctx->state[2]) + 24)); + *(reinterpret_cast(scratchPad[2]->state) + 24)); uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(ctx->state[3]) + 24)); + *(reinterpret_cast(scratchPad[3]->state) + 24)); uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ - *(reinterpret_cast(ctx->state[4]) + 24)); + *(reinterpret_cast(scratchPad[4]->state) + 24)); - const uint8_t* l0 = ctx->memory; - const uint8_t* l1 = ctx->memory + MEM; - const uint8_t* l2 = ctx->memory + 2 * MEM; - const uint8_t* l3 = ctx->memory + 3 * MEM; - const uint8_t* l4 = ctx->memory + 4 * MEM; - uint64_t* h0 = reinterpret_cast(ctx->state[0]); - uint64_t* h1 = reinterpret_cast(ctx->state[1]); - uint64_t* h2 = reinterpret_cast(ctx->state[2]); - uint64_t* h3 = reinterpret_cast(ctx->state[3]); - uint64_t* h4 = reinterpret_cast(ctx->state[4]); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); 
cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); @@ -3682,27 +4269,36 @@ public: keccakf(h3, 24); keccakf(h4, 24); - extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output); - extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32); - extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64); - extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96); - extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } inline static void hashHeavy(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } inline static void hashHeavyHaven(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - cryptonight_ctx* __restrict__ ctx) + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + // not supported + } + + inline static void hashHeavyTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) { // not supported } }; + #endif /* __CRYPTONIGHT_X86_H__ */ diff --git a/src/crypto/SSE2NEON.h b/src/crypto/SSE2NEON.h index 6a00448d..0b8413fc 100644 --- a/src/crypto/SSE2NEON.h +++ b/src/crypto/SSE2NEON.h @@ -1189,6 +1189,12 @@ FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } +// Compares the 4 signed 32-bit 
integers in a and the 4 signed 32-bit integers in b for equality. https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx +FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) +{ + return vreinterpretq_m128i_u32(vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); +} + // Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx // see also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean diff --git a/src/crypto/soft_aes.h b/src/crypto/soft_aes.h index 0703f98d..20c67c09 100644 --- a/src/crypto/soft_aes.h +++ b/src/crypto/soft_aes.h @@ -105,12 +105,29 @@ static inline __m128i soft_aesenc(const uint32_t* in, __m128i key) return _mm_xor_si128(out, key); } +static inline __m128i soft_aesenc(__m128i in, __m128i key) +{ + uint32_t x0, x1, x2, x3; + x0 = _mm_cvtsi128_si32(in); + x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); + x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); + x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); + + __m128i out = _mm_set_epi32( + (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]), + (saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]), + (saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]), + (saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24])); + + return _mm_xor_si128(out, key); +} + static inline uint32_t sub_word(uint32_t key) { - return (saes_sbox[key >> 24 ] << 24) | - (saes_sbox[(key >> 16) & 0xff] << 16 ) | - (saes_sbox[(key >> 8) & 0xff] << 8 ) | - saes_sbox[key & 0xff]; +
return (saes_sbox[key >> 24 ] << 24) | + (saes_sbox[(key >> 16) & 0xff] << 16 ) | + (saes_sbox[(key >> 8) & 0xff] << 8 ) | + saes_sbox[key & 0xff]; } #if defined(__clang__) || defined(XMRIG_ARM) diff --git a/src/log/Log.h b/src/log/Log.h index ea0fe1b3..53564394 100644 --- a/src/log/Log.h +++ b/src/log/Log.h @@ -73,6 +73,19 @@ private: }; +#define RED_BOLD(x) "\x1B[1;31m" x "\x1B[0m" +#define RED(x) "\x1B[0;31m" x "\x1B[0m" +#define GREEN_BOLD(x) "\x1B[1;32m" x "\x1B[0m" +#define GREEN(x) "\x1B[0;32m" x "\x1B[0m" +#define MAGENTA_BOLD(x) "\x1B[1;35m" x "\x1B[0m" +#define MAGENTA(x) "\x1B[0;35m" x "\x1B[0m" +#define CYAN_BOLD(x) "\x1B[1;36m" x "\x1B[0m" +#define CYAN(x) "\x1B[0;36m" x "\x1B[0m" +#define WHITE_BOLD(x) "\x1B[1;37m" x "\x1B[0m" +#define WHITE(x) "\x1B[0;37m" x "\x1B[0m" +#define YELLOW_BOLD(x) "\x1B[1;33m" x "\x1B[0m" +#define YELLOW(x) "\x1B[0;33m" x "\x1B[0m" + #define LOG_ERR(x, ...) Log::i()->message(Log::ERR, x, ##__VA_ARGS__) #define LOG_WARN(x, ...) Log::i()->message(Log::WARNING, x, ##__VA_ARGS__) #define LOG_NOTICE(x, ...) 
Log::i()->message(Log::NOTICE, x, ##__VA_ARGS__) diff --git a/src/version.h b/src/version.h index 33cfdd7b..776d2767 100644 --- a/src/version.h +++ b/src/version.h @@ -36,14 +36,14 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.6.4 (based on XMRig)" +#define APP_VERSION "1.6.5_beta1 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 #define APP_VER_MINOR 6 -#define APP_VER_BUILD 4 +#define APP_VER_BUILD 5 #define APP_VER_REV 0 #ifndef NDEBUG diff --git a/src/workers/Handle.cpp b/src/workers/Handle.cpp index c461cee7..89906a36 100644 --- a/src/workers/Handle.cpp +++ b/src/workers/Handle.cpp @@ -25,7 +25,7 @@ #include "workers/Handle.h" -Handle::Handle(int threadId, int threads, int64_t affinity, int priority) : +Handle::Handle(size_t threadId, size_t threads, int64_t affinity, int priority) : m_priority(priority), m_threadId(threadId), m_threads(threads), diff --git a/src/workers/Handle.h b/src/workers/Handle.h index 9faae0d0..21506faf 100644 --- a/src/workers/Handle.h +++ b/src/workers/Handle.h @@ -35,21 +35,21 @@ class IWorker; class Handle { public: - Handle(int threadId, int threads, int64_t affinity, int priority); + Handle(size_t threadId, size_t threads, int64_t affinity, int priority); void join(); void start(void (*callback) (void *)); inline int priority() const { return m_priority; } - inline int threadId() const { return m_threadId; } - inline int threads() const { return m_threads; } + inline size_t threadId() const { return m_threadId; } + inline size_t threads() const { return m_threads; } inline int64_t affinity() const { return m_affinity; } inline IWorker *worker() const { return m_worker; } inline void setWorker(IWorker *worker) { m_worker = worker; } private: int m_priority; - int m_threadId; - int m_threads; + size_t m_threadId; + size_t m_threads; int64_t m_affinity; IWorker 
*m_worker; uv_thread_t m_thread; diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index d1d16ad6..e599b87f 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -24,6 +24,7 @@ #include +#include #include "crypto/CryptoNight.h" @@ -35,7 +36,7 @@ class MultiWorker : public Worker { public: - explicit MultiWorker(Handle *handle, size_t hashMultiplier); + explicit MultiWorker(Handle *handle, size_t hashFactor); ~MultiWorker(); void start() override; @@ -50,7 +51,10 @@ private: uint8_t* m_hash; State *m_state; State *m_pausedState; - size_t m_hashMultiplier; + size_t m_hashFactor; + + ScratchPadMem scratchPadMem; + ScratchPad* scratchPads[MAX_NUM_HASH_BLOCKS]; }; class MultiWorker::State @@ -77,13 +81,14 @@ public: }; -MultiWorker::MultiWorker(Handle *handle, size_t hashMultiplier) +MultiWorker::MultiWorker(Handle *handle, size_t hashFactor) : Worker(handle), - m_hash(new uint8_t[32 * hashMultiplier]), - m_state(new MultiWorker::State(hashMultiplier)), - m_pausedState(new MultiWorker::State(hashMultiplier)), - m_hashMultiplier(hashMultiplier) + m_hash(new uint8_t[32 * hashFactor]), + m_state(new MultiWorker::State(hashFactor)), + m_pausedState(new MultiWorker::State(hashFactor)), + m_hashFactor(hashFactor) { + scratchPadMem = Mem::create(scratchPads, m_id); } MultiWorker::~MultiWorker() @@ -91,10 +96,25 @@ MultiWorker::~MultiWorker() delete[] m_hash; delete m_state; delete m_pausedState; + + Mem::release(scratchPads, scratchPadMem, m_id); } void MultiWorker::start() { + const size_t memory = scratchPadMem.realSize / 1048576; + + if (Options::i()->colors()) { + LOG_INFO(WHITE_BOLD("Starting thread ") GREEN_BOLD("%zu/%zu") " affined to core: " GREEN_BOLD("#%d") " -> huge pages:" GREEN_BOLD(" %s%zu/%zu") " scratchpad: " CYAN_BOLD("%zu.0 MB"), + m_id+1, Options::i()->threads(), m_affinedCpu, + (scratchPadMem.hugePages == scratchPadMem.pages ? "\x1B[1;32m" : (scratchPadMem.hugePages == 0 ? 
"\x1B[1;31m" : "\x1B[1;33m")), + scratchPadMem.hugePages, scratchPadMem.pages, memory); + } + else { + LOG_INFO("Starting thread %zu/%zu affined to core: #%d -> huge pages: %zu/%zu scratchpad: %zu.0 MB", + m_id+1, Options::i()->threads(), m_affinedCpu, scratchPadMem.hugePages, scratchPadMem.pages, memory); + } + while (Workers::sequence() > 0) { if (Workers::isPaused()) { do { @@ -114,15 +134,15 @@ void MultiWorker::start() storeStats(); } - m_count += m_hashMultiplier; + m_count += m_hashFactor; - for (size_t i=0; i < m_hashMultiplier; ++i) { + for (size_t i=0; i < m_hashFactor; ++i) { *Job::nonce(m_state->blob + i * m_state->job.size()) = ++m_state->nonces[i]; } - CryptoNight::hash(m_hashMultiplier, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, m_ctx); + CryptoNight::hash(m_hashFactor, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); - for (size_t i=0; i < m_hashMultiplier; ++i) { + for (size_t i=0; i < m_hashFactor; ++i) { if (*reinterpret_cast(m_hash + 24 + i * 32) < m_state->job.target()) { Workers::submit(JobResult(m_state->job.poolId(), m_state->job.id(), m_state->nonces[i], m_hash + i * 32, m_state->job.diff()), m_id); @@ -162,7 +182,7 @@ void MultiWorker::consumeJob() m_state->job = std::move(job); - for (size_t i=0; i < m_hashMultiplier; ++i) { + for (size_t i=0; i < m_hashFactor; ++i) { memcpy(m_state->blob + i * m_state->job.size(), m_state->job.blob(), m_state->job.size()); if (m_state->job.isNicehash()) { m_state->nonces[i] = (*Job::nonce(m_state->blob + i * m_state->job.size()) & 0xff000000U) + @@ -183,6 +203,6 @@ void MultiWorker::save(const Job &job) } } -Worker* createMultiWorker(size_t numHashes, Handle *handle) { - return new MultiWorker(handle, numHashes); +Worker* createMultiWorker(Handle *handle, size_t hashFactor) { + return new MultiWorker(handle, hashFactor); } \ No newline at end of file diff --git a/src/workers/MultiWorker.h b/src/workers/MultiWorker.h index 33f5a062..14b3d13d 
100644 --- a/src/workers/MultiWorker.h +++ b/src/workers/MultiWorker.h @@ -33,7 +33,7 @@ class Handle; -Worker* createMultiWorker(size_t numHashes, Handle *handle); +Worker* createMultiWorker(Handle *handle, size_t hashFactor); #endif /* __SINGLEWORKER_H__ */ diff --git a/src/workers/Worker.cpp b/src/workers/Worker.cpp index 02646ced..b5a84f8b 100644 --- a/src/workers/Worker.cpp +++ b/src/workers/Worker.cpp @@ -39,12 +39,11 @@ Worker::Worker(Handle *handle) : m_count(0), m_sequence(0) { - if (Cpu::threads() > 1 && handle->affinity() != -1L) { - Cpu::setAffinity(m_id, handle->affinity()); + if (m_threads > 1 && m_threads <= Cpu::threads()) { + m_affinedCpu = Cpu::setThreadAffinity(m_id, handle->affinity()); } Platform::setThreadPriority(handle->priority()); - m_ctx = Mem::create(m_id); } diff --git a/src/workers/Worker.h b/src/workers/Worker.h index 11c4a198..9abf2ec3 100644 --- a/src/workers/Worker.h +++ b/src/workers/Worker.h @@ -32,7 +32,7 @@ #include "interfaces/IWorker.h" -struct cryptonight_ctx; +struct ScratchPad; class Handle; @@ -48,9 +48,9 @@ public: protected: void storeStats(); - cryptonight_ctx *m_ctx; int m_id; - int m_threads; + int m_affinedCpu; + size_t m_threads; std::atomic m_hashCount; std::atomic m_timestamp; uint64_t m_count; diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp index e225bcc5..ef786842 100644 --- a/src/workers/Workers.cpp +++ b/src/workers/Workers.cpp @@ -100,9 +100,8 @@ void Workers::setJob(const Job &job) } -void Workers::start(int64_t affinity, int priority) +void Workers::start(size_t threads, int64_t affinityMask, int priority) { - const int threads = Mem::threads(); m_hashrate = new Hashrate(threads); uv_mutex_init(&m_mutex); @@ -115,8 +114,8 @@ void Workers::start(int64_t affinity, int priority) uv_timer_init(uv_default_loop(), &m_timer); uv_timer_start(&m_timer, Workers::onTick, 500, 500); - for (int i = 0; i < threads; ++i) { - auto handle = new Handle(i, threads, affinity, priority); + for (size_t i = 0; i < 
threads; ++i) { + auto handle = new Handle(i, threads, affinityMask, priority); m_workers.push_back(handle); handle->start(Workers::onReady); } @@ -151,7 +150,7 @@ void Workers::submit(const JobResult &result, int threadId) void Workers::onReady(void *arg) { auto handle = static_cast(arg); - handle->setWorker(createMultiWorker(Mem::getThreadHashFactor(handle->threadId()), handle)); + handle->setWorker(createMultiWorker(handle, Mem::getThreadHashFactor(handle->threadId()))); handle->worker()->start(); } diff --git a/src/workers/Workers.h b/src/workers/Workers.h index 22a2b376..c21f5564 100644 --- a/src/workers/Workers.h +++ b/src/workers/Workers.h @@ -46,7 +46,7 @@ public: static void printHashrate(bool detail); static void setEnabled(bool enabled); static void setJob(const Job &job); - static void start(int64_t affinity, int priority); + static void start(size_t threads, int64_t affinityMask, int priority); static void stop(); static void submit(const JobResult &result, int threadId); diff --git a/test/cryptonight/cryptonight.c b/test/cryptonight/cryptonight.c index bcc0db30..dd0ec615 100644 --- a/test/cryptonight/cryptonight.c +++ b/test/cryptonight/cryptonight.c @@ -26,10 +26,10 @@ const static char input2[] = "This is a test"; const static char input3[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Vivamus pellentesque metus."; -void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); +void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx); static char hash[64]; @@ -55,21 +55,21 @@ static char *bin2hex(const unsigned char *p, size_t len) static void * create_ctx(int ratio) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16); + struct ScratchPad *ctx = (struct ScratchPad*) _mm_malloc(sizeof(struct ScratchPad), 16); ctx->memory = (uint8_t *) _mm_malloc(MEMORY * ratio, 16); return ctx; } -static void free_ctx(struct cryptonight_ctx *ctx) { +static void free_ctx(struct ScratchPad *ctx) { _mm_free(ctx->memory); _mm_free(ctx); } void test_cryptonight_av1_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1); cryptonight_av1_aesni(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32)); @@ -86,7 +86,7 @@ void test_cryptonight_av1_should_CalcHash(void) { void test_cryptonight_av2_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2); 
cryptonight_av2_aesni_double(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64)); @@ -97,7 +97,7 @@ void test_cryptonight_av2_should_CalcHash(void) void test_cryptonight_av3_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1); cryptonight_av3_softaes(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32)); @@ -114,7 +114,7 @@ void test_cryptonight_av3_should_CalcHash(void) void test_cryptonight_av4_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2); cryptonight_av4_softaes_double(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64)); diff --git a/test/cryptonight_lite/cryptonight_lite.c b/test/cryptonight_lite/cryptonight_lite.c index a6d5b554..61319da2 100644 --- a/test/cryptonight_lite/cryptonight_lite.c +++ b/test/cryptonight_lite/cryptonight_lite.c @@ -24,15 +24,15 @@ const static char input1[152] = { }; -void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {} -void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {} -void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {} -void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {} +void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx) {} +void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx) {} +void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx) {} +void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx) {} -void 
cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); -void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx); +void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx); +void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx); static char hash[64]; @@ -56,21 +56,21 @@ static char *bin2hex(const unsigned char *p, size_t len) static void * create_ctx(int ratio) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16); + struct ScratchPad *ctx = (struct ScratchPad*) _mm_malloc(sizeof(struct ScratchPad), 16); ctx->memory = (uint8_t *) _mm_malloc(MEMORY_LITE * ratio, 16); return ctx; } -static void free_ctx(struct cryptonight_ctx *ctx) { +static void free_ctx(struct ScratchPad *ctx) { _mm_free(ctx->memory); _mm_free(ctx); } void test_cryptonight_lite_av1_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1); cryptonight_lite_av1_aesni(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32)); @@ -81,7 +81,7 @@ void test_cryptonight_lite_av1_should_CalcHash(void) { void test_cryptonight_lite_av2_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2); 
cryptonight_lite_av2_aesni_double(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64)); @@ -91,7 +91,7 @@ void test_cryptonight_lite_av2_should_CalcHash(void) void test_cryptonight_lite_av3_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1); cryptonight_lite_av3_softaes(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32)); @@ -102,7 +102,7 @@ void test_cryptonight_lite_av3_should_CalcHash(void) { void test_cryptonight_lite_av4_should_CalcHash(void) { - struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2); + struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2); cryptonight_lite_av4_softaes_double(input1, 76, &hash, ctx); TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64));