From 90699d58ec347bffdcfed439dfbeea2b3d8ddbce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ben=20Gr=C3=A4f?= <ben@gra3f.de>
Date: Tue, 26 Jun 2018 20:25:38 +0200
Subject: [PATCH] Features of 1.6.5 (#140)

* Improve hashrate: add autodetection mode for cpu-affinity
* Improve hashrate and make it more stable: refactor memory allocation
* Add TubeV4 support (cn-heavy + ipbc mod + soft-aes mod)
* Update cpp-httplib to fix stop/freeze of cc communication on some miners
* Fix cn-heavy on arm processors
---
 src/3rdparty/cpp-httplib/httplib.h       | 3164 +++++++++++++-------
 src/App.cpp                              |   25 +-
 src/App_unix.cpp                         |    4 -
 src/App_win.cpp                          |    4 -
 src/Cpu.cpp                              |   25 +-
 src/Cpu.h                                |    3 +-
 src/CpuImpl.h                            |    3 +-
 src/Cpu_mac.cpp                          |   24 +-
 src/Cpu_unix.cpp                         |   44 +-
 src/Cpu_win.cpp                          |   38 +-
 src/Mem.cpp                              |   49 +-
 src/Mem.h                                |   47 +-
 src/Mem_unix.cpp                         |   83 +-
 src/Mem_win.cpp                          |   87 +-
 src/Options.cpp                          |    8 +-
 src/PowVariant.h                         |   10 +-
 src/Summary.cpp                          |   20 +-
 src/api/ApiState.cpp                     |    2 +-
 src/cc/CCClient.cpp                      |    5 +-
 src/crypto/CryptoNight.cpp               |  176 +-
 src/crypto/CryptoNight.h                 |    6 +-
 src/crypto/CryptoNight_arm.h             | 3366 +++++++++++++---------
 src/crypto/CryptoNight_test.h            |   12 +-
 src/crypto/CryptoNight_x86.h             | 1254 +++++---
 src/crypto/SSE2NEON.h                    |    6 +
 src/crypto/soft_aes.h                    |   25 +-
 src/log/Log.h                            |   13 +
 src/version.h                            |    4 +-
 src/workers/Handle.cpp                   |    2 +-
 src/workers/Handle.h                     |   10 +-
 src/workers/MultiWorker.cpp              |   48 +-
 src/workers/MultiWorker.h                |    2 +-
 src/workers/Worker.cpp                   |    5 +-
 src/workers/Worker.h                     |    6 +-
 src/workers/Workers.cpp                  |    9 +-
 src/workers/Workers.h                    |    2 +-
 test/cryptonight/cryptonight.c           |   20 +-
 test/cryptonight_lite/cryptonight_lite.c |   28 +-
 38 files changed, 5525 insertions(+), 3114 deletions(-)
diff --git a/src/3rdparty/cpp-httplib/httplib.h b/src/3rdparty/cpp-httplib/httplib.h
index d3c46c7c..87171e0b 100644
--- a/src/3rdparty/cpp-httplib/httplib.h
+++ b/src/3rdparty/cpp-httplib/httplib.h
@@ -2,30 +2,31 @@
 //  httplib.h
 //
 //  Copyright (c) 2017 Yuji Hirose. All rights reserved.
-//  The Boost Software License 1.0
+//  MIT License
 //
 
 #ifndef _CPPHTTPLIB_HTTPLIB_H_
 #define _CPPHTTPLIB_HTTPLIB_H_
 
-#ifdef _MSC_VER
+#ifdef _WIN32
+#ifndef _CRT_SECURE_NO_WARNINGS
 #define _CRT_SECURE_NO_WARNINGS
+#endif
+#ifndef _CRT_NONSTDC_NO_DEPRECATE
 #define _CRT_NONSTDC_NO_DEPRECATE
+#endif
 
-#ifndef SO_SYNCHRONOUS_NONALERT
-#define SO_SYNCHRONOUS_NONALERT 0x20
-#endif
-#ifndef SO_OPENTYPE
-#define SO_OPENTYPE 0x7008
-#endif
-#if (_MSC_VER < 1900)
+#if defined(_MSC_VER) && _MSC_VER < 1900
 #define snprintf _snprintf_s
 #endif
 
+#ifndef S_ISREG
 #define S_ISREG(m)  (((m)&S_IFREG)==S_IFREG)
+#endif
+#ifndef S_ISDIR
 #define S_ISDIR(m)  (((m)&S_IFDIR)==S_IFDIR)
+#endif
 
-#include <fcntl.h>
 #include <io.h>
 #include <winsock2.h>
 #include <ws2tcpip.h>
@@ -33,717 +34,1238 @@
 #undef min
 #undef max
 
+#ifndef strcasecmp
+#define strcasecmp _stricmp
+#endif
+
 typedef SOCKET socket_t;
 #else
 #include <pthread.h>
 #include <unistd.h>
-
-#if WIN32
-#include <winsock2.h>
-#else
 #include <netdb.h>
+#include <cstring>
 #include <netinet/in.h>
 #include <arpa/inet.h>
-#include <sys/socket.h>
-#endif
-
-#include <cstring>
 #include <signal.h>
+#include <sys/socket.h>
+#include <sys/select.h>
 
 typedef int socket_t;
+#define INVALID_SOCKET (-1)
 #endif
 
 #include <fstream>
 #include <functional>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <regex>
 #include <string>
+#include <thread>
 #include <sys/stat.h>
+#include <fcntl.h>
 #include <assert.h>
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 #include <openssl/ssl.h>
 #endif
 
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+#include <zlib.h>
+#endif
+
+/*
+ * Configuration
+ */
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND 5
+#define CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND 0
+
 namespace httplib
 {
 
-typedef std::map<std::string, std::string>      Map;
-typedef std::multimap<std::string, std::string> MultiMap;
-typedef std::smatch                             Match;
+    namespace detail {
 
-struct Request {
-    std::string method;
-    std::string path;
-    MultiMap    headers;
-    std::string body;
-    Map         params;
-    Match       matches;
+        // Case-insensitive "less than" for strings; the comparator of `Headers`.
+        struct ci {
+            bool operator() (const std::string & s1, const std::string & s2) const {
+                // Compare as unsigned char: tolower() on a negative char value is undefined.
+                return std::lexicographical_compare(
+                        s1.begin(), s1.end(), s2.begin(), s2.end(),
+                        [](unsigned char c1, unsigned char c2) {
+                            return ::tolower(c1) < ::tolower(c2); });
+            }
+        };
 
-    bool has_header(const char* key) const;
-    std::string get_header_value(const char* key) const;
-    void set_header(const char* key, const char* val);
+    } // namespace detail
 
-    bool has_param(const char* key) const;
-};
+    enum class HttpVersion { v1_0 = 0, v1_1 };
 
-struct Response {
-    int         status;
-    MultiMap    headers;
-    std::string body;
+    typedef std::multimap<std::string, std::string, detail::ci>  Headers;
 
-    bool has_header(const char* key) const;
-    std::string get_header_value(const char* key) const;
-    void set_header(const char* key, const char* val);
+    template<typename uint64_t, typename... Args>
+    std::pair<std::string, std::string> make_range_header(uint64_t value, Args... args);
 
-    void set_redirect(const char* url);
-    void set_content(const char* s, size_t n, const char* content_type);
-    void set_content(const std::string& s, const char* content_type);
+    typedef std::multimap<std::string, std::string>                Params;
+    typedef std::smatch                                            Match;
+    typedef std::function<void (uint64_t current, uint64_t total)> Progress;
 
-    Response() : status(-1) {}
-};
+    struct MultipartFile {
+        std::string filename;
+        std::string content_type;
+        size_t offset = 0;
+        size_t length = 0;
+    };
+    typedef std::multimap<std::string, MultipartFile> MultipartFiles;
 
-class Stream {
-public:
-    virtual ~Stream() {}
-    virtual int read(char* ptr, size_t size) = 0;
-    virtual int write(const char* ptr, size_t size1) = 0;
-    virtual int write(const char* ptr) = 0;
-};
+    struct Request {
+        std::string    version;
+        std::string    method;
+        std::string    target;
+        std::string    path;
+        Headers        headers;
+        std::string    body;
+        Params         params;
+        MultipartFiles files;
+        Match          matches;
 
-class SocketStream : public Stream {
-public:
-    SocketStream(socket_t sock);
-    virtual ~SocketStream();
+        Progress       progress;
 
-    virtual int read(char* ptr, size_t size);
-    virtual int write(const char* ptr, size_t size);
-    virtual int write(const char* ptr);
+        bool has_header(const char* key) const;
+        std::string get_header_value(const char* key) const;
+        void set_header(const char* key, const char* val);
 
-private:
-    socket_t sock_;
-};
+        bool has_param(const char* key) const;
+        std::string get_param_value(const char* key) const;
 
-class Server {
-public:
-    typedef std::function<void (const Request&, Response&)> Handler;
-    typedef std::function<void (const Request&, const Response&)> Logger;
+        bool has_file(const char* key) const;
+        MultipartFile get_file_value(const char* key) const;
+    };
 
-    Server();
-    virtual ~Server();
+    struct Response {
+        std::string version;
+        int         status;
+        Headers     headers;
+        std::string body;
 
-    void get(const char* pattern, Handler handler);
-    void post(const char* pattern, Handler handler);
+        bool has_header(const char* key) const;
+        std::string get_header_value(const char* key) const;
+        void set_header(const char* key, const char* val);
 
-    bool set_base_dir(const char* path);
+        void set_redirect(const char* uri);
+        void set_content(const char* s, size_t n, const char* content_type);
+        void set_content(const std::string& s, const char* content_type);
 
-    void set_error_handler(Handler handler);
-    void set_logger(Logger logger);
+        Response() : status(-1) {}
+    };
 
-    bool listen(const char* host, int port, int socket_flags = 0);
-    void stop();
+    class Stream {
+    public:
+        virtual ~Stream() {}
+        virtual int read(char* ptr, size_t size) = 0;
+        virtual int write(const char* ptr, size_t size1) = 0;
+        virtual int write(const char* ptr) = 0;
+        virtual std::string get_remote_addr() = 0;
 
-protected:
-    void process_request(Stream& strm);
+        template <typename ...Args>
+        void write_format(const char* fmt, const Args& ...args);
+    };
 
-private:
-    typedef std::vector<std::pair<std::regex, Handler>> Handlers;
+    class SocketStream : public Stream {
+    public:
+        SocketStream(socket_t sock);
+        virtual ~SocketStream();
 
-    bool routing(Request& req, Response& res);
-    bool handle_file_request(Request& req, Response& res);
-    bool dispatch_request(Request& req, Response& res, Handlers& handlers);
+        virtual int read(char* ptr, size_t size);
+        virtual int write(const char* ptr, size_t size);
+        virtual int write(const char* ptr);
+        virtual std::string get_remote_addr();
 
-    bool read_request_line(Stream& strm, Request& req);
+    private:
+        socket_t sock_;
+    };
 
-    virtual bool read_and_close_socket(socket_t sock);
+    class Server {
+    public:
+        typedef std::function<void (const Request&, Response&)> Handler;
+        typedef std::function<void (const Request&, const Response&)> Logger;
 
-    socket_t    svr_sock_;
-    std::string base_dir_;
-    Handlers    get_handlers_;
-    Handlers    post_handlers_;
-    Handler     error_handler_;
-    Logger      logger_;
-};
+        Server();
 
-class Client {
-public:
-    Client(const char* host, int port);
-    virtual ~Client();
+        virtual ~Server();
 
-    std::shared_ptr<Response> get(const char* path);
-    std::shared_ptr<Response> head(const char* path);
-    std::shared_ptr<Response> post(const char* path, const std::string& body, const char* content_type);
-    std::shared_ptr<Response> post(const char* path, const Map& params);
+        virtual bool is_valid() const;
 
-    bool send(const Request& req, Response& res);
+        Server& Get(const char* pattern, Handler handler);
+        Server& Post(const char* pattern, Handler handler);
 
-protected:
-    bool process_request(Stream& strm, const Request& req, Response& res);
+        Server& Put(const char* pattern, Handler handler);
+        Server& Delete(const char* pattern, Handler handler);
+        Server& Options(const char* pattern, Handler handler);
 
-    const std::string host_;
-    const int         port_;
-    const std::string host_and_port_;
+        bool set_base_dir(const char* path);
 
-private:
-    bool read_response_line(Stream& strm, Response& res);
-    void add_default_headers(Request& req);
+        void set_error_handler(Handler handler);
+        void set_logger(Logger logger);
 
-    virtual bool read_and_close_socket(socket_t sock, const Request& req, Response& res);
-};
+        void set_keep_alive_max_count(size_t count);
+
+        int bind_to_any_port(const char* host, int socket_flags = 0);
+        bool listen_after_bind();
+
+        bool listen(const char* host, int port, int socket_flags = 0);
+
+        bool is_running() const;
+        void stop();
+
+    protected:
+        bool process_request(Stream& strm, bool last_connection, bool& connection_close);
+
+        size_t keep_alive_max_count_;
+
+    private:
+        typedef std::vector<std::pair<std::regex, Handler>> Handlers;
+
+        socket_t create_server_socket(const char* host, int port, int socket_flags) const;
+        int bind_internal(const char* host, int port, int socket_flags);
+        bool listen_internal();
+
+        bool routing(Request& req, Response& res);
+        bool handle_file_request(Request& req, Response& res);
+        bool dispatch_request(Request& req, Response& res, Handlers& handlers);
+
+        bool parse_request_line(const char* s, Request& req);
+        void write_response(Stream& strm, bool last_connection, const Request& req, Response& res);
+
+        virtual bool read_and_close_socket(socket_t sock);
+
+        bool        is_running_;
+        socket_t    svr_sock_;
+        std::string base_dir_;
+        Handlers    get_handlers_;
+        Handlers    post_handlers_;
+        Handlers    put_handlers_;
+        Handlers    delete_handlers_;
+        Handlers    options_handlers_;
+        Handler     error_handler_;
+        Logger      logger_;
+
+        // TODO: Use thread pool...
+        std::mutex  running_threads_mutex_;
+        int         running_threads_;
+    };
+
+    class Client {
+    public:
+        Client(
+                const char* host,
+                int port = 80,
+                size_t timeout_sec = 300);
+
+        virtual ~Client();
+
+        virtual bool is_valid() const;
+
+        std::shared_ptr<Response> Get(const char* path, Progress progress = nullptr);
+        std::shared_ptr<Response> Get(const char* path, const Headers& headers, Progress progress = nullptr);
+
+        std::shared_ptr<Response> Head(const char* path);
+        std::shared_ptr<Response> Head(const char* path, const Headers& headers);
+
+        std::shared_ptr<Response> Post(const char* path, const std::string& body, const char* content_type);
+        std::shared_ptr<Response> Post(const char* path, const Headers& headers, const std::string& body, const char* content_type);
+
+        std::shared_ptr<Response> Post(const char* path, const Params& params);
+        std::shared_ptr<Response> Post(const char* path, const Headers& headers, const Params& params);
+
+        std::shared_ptr<Response> Put(const char* path, const std::string& body, const char* content_type);
+        std::shared_ptr<Response> Put(const char* path, const Headers& headers, const std::string& body, const char* content_type);
+
+        std::shared_ptr<Response> Delete(const char* path);
+        std::shared_ptr<Response> Delete(const char* path, const Headers& headers);
+
+        std::shared_ptr<Response> Options(const char* path);
+        std::shared_ptr<Response> Options(const char* path, const Headers& headers);
+
+        bool send(Request& req, Response& res);
+
+    protected:
+        bool process_request(Stream& strm, Request& req, Response& res, bool& connection_close);
+
+        const std::string host_;
+        const int         port_;
+        size_t            timeout_sec_;
+        const std::string host_and_port_;
+
+    private:
+        socket_t create_client_socket() const;
+        bool read_response_line(Stream& strm, Response& res);
+        void write_request(Stream& strm, Request& req);
+
+        virtual bool read_and_close_socket(socket_t sock, Request& req, Response& res);
+    };
 
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream : public Stream {
-public:
-    SSLSocketStream(SSL* ssl);
-    virtual ~SSLSocketStream();
+    class SSLSocketStream : public Stream {
+    public:
+        SSLSocketStream(socket_t sock, SSL* ssl);
+        virtual ~SSLSocketStream();
 
-    virtual int read(char* ptr, size_t size);
-    virtual int write(const char* ptr, size_t size);
-    virtual int write(const char* ptr);
+        virtual int read(char* ptr, size_t size);
+        virtual int write(const char* ptr, size_t size);
+        virtual int write(const char* ptr);
+        virtual std::string get_remote_addr();
 
-private:
-    SSL* ssl_;
-};
+    private:
+        socket_t sock_;
+        SSL* ssl_;
+    };
 
-class SSLServer : public Server {
-public:
-    SSLServer(const char* cert_path, const char* private_key_path);
-    virtual ~SSLServer();
+    class SSLServer : public Server {
+    public:
+        SSLServer(
+                const char* cert_path, const char* private_key_path);
 
-private:
-    virtual bool read_and_close_socket(socket_t sock);
+        virtual ~SSLServer();
 
-    SSL_CTX* ctx_;
-};
+        virtual bool is_valid() const;
 
-class SSLClient : public Client {
-public:
-    SSLClient(const char* host, int port);
-    virtual ~SSLClient();
+    private:
+        virtual bool read_and_close_socket(socket_t sock);
 
-private:
-    virtual bool read_and_close_socket(socket_t sock, const Request& req, Response& res);
+        SSL_CTX* ctx_;
+        std::mutex ctx_mutex_;
+    };
 
-    SSL_CTX* ctx_;
-};
+    class SSLClient : public Client {
+    public:
+        SSLClient(
+                const char* host,
+                int port = 80,
+                size_t timeout_sec = 300);
+
+        virtual ~SSLClient();
+
+        virtual bool is_valid() const;
+
+    private:
+        virtual bool read_and_close_socket(socket_t sock, Request& req, Response& res);
+
+        SSL_CTX* ctx_;
+        std::mutex ctx_mutex_;
+    };
 #endif
 
 /*
  * Implementation
  */
-namespace detail {
+    namespace detail {
 
-template <class Fn>
-void split(const char* b, const char* e, char d, Fn fn)
-{
-    int i = 0;
-    int beg = 0;
+        template <class Fn>
+        void split(const char* b, const char* e, char d, Fn fn)
+        {
+            int i = 0;
+            int beg = 0;
 
-    while (e ? (b + i != e) : (b[i] != '\0')) {
-        if (b[i] == d) {
-            fn(&b[beg], &b[i]);
-            beg = i + 1;
-        }
-        i++;
-    }
+            while (e ? (b + i != e) : (b[i] != '\0')) {
+                if (b[i] == d) {
+                    fn(&b[beg], &b[i]);
+                    beg = i + 1;
+                }
+                i++;
+            }
 
-    if (i) {
-        fn(&b[beg], &b[i]);
-    }
-}
-
-inline bool socket_gets(Stream& strm, char* buf, int bufsiz)
-{
-    // TODO: buffering for better performance
-    size_t i = 0;
-
-    for (;;) {
-        char byte;
-        auto n = strm.read(&byte, 1);
-
-        if (n < 1) {
-            if (i == 0) {
-                return false;
-            } else {
-                break;
+            if (i) {
+                fn(&b[beg], &b[i]);
             }
         }
 
-        buf[i++] = byte;
+// NOTE: until the read size reaches `fixed_buffer_size`, `fixed_buffer` is used
+// to store data. The caller can allocate it on the stack for performance.
+        class stream_line_reader {
+        public:
+            stream_line_reader(Stream& strm, char* fixed_buffer, size_t fixed_buffer_size)
+                    : strm_(strm)
+                    , fixed_buffer_(fixed_buffer)
+                    , fixed_buffer_size_(fixed_buffer_size) {
+            }
 
-        if (byte == '\n') {
-            break;
-        }
-    }
+            const char* ptr() const {
+                if (glowable_buffer_.empty()) {
+                    return fixed_buffer_;
+                } else {
+                    return glowable_buffer_.data();
+                }
+            }
 
-    buf[i] = '\0';
-    return true;
-}
+            bool getline() {
+                fixed_buffer_used_size_ = 0;
+                glowable_buffer_.clear();
 
-template <typename ...Args>
-inline void socket_printf(Stream& strm, const char* fmt, const Args& ...args)
-{
-    char buf[BUFSIZ];
-    auto n = snprintf(buf, BUFSIZ, fmt, args...);
-    if (n > 0) {
-        if (n >= BUFSIZ) {
-            // TODO: buffer size is not large enough...
-        } else {
-            strm.write(buf, n);
-        }
-    }
-}
+                for (size_t i = 0; ; i++) {
+                    char byte;
+                    auto n = strm_.read(&byte, 1);
 
-inline int close_socket(socket_t sock)
-{
-#if defined(_MSC_VER) || defined(WIN32)
-    return closesocket(sock);
+                    if (n < 0) {
+                        return false;
+                    } else if (n == 0) {
+                        if (i == 0) {
+                            return false;
+                        } else {
+                            break;
+                        }
+                    }
+
+                    append(byte);
+
+                    if (byte == '\n') {
+                        break;
+                    }
+                }
+
+                return true;
+            }
+
+        private:
+            void append(char c) {
+                if (fixed_buffer_used_size_ < fixed_buffer_size_ - 1) {
+                    fixed_buffer_[fixed_buffer_used_size_++] = c;
+                    fixed_buffer_[fixed_buffer_used_size_] = '\0';
+                } else {
+                    if (glowable_buffer_.empty()) {
+                        assert(fixed_buffer_[fixed_buffer_used_size_] == '\0');
+                        glowable_buffer_.assign(fixed_buffer_, fixed_buffer_used_size_);
+                    }
+                    glowable_buffer_ += c;
+                }
+            }
+
+            Stream& strm_;
+            char* fixed_buffer_;
+            const size_t fixed_buffer_size_;
+            size_t fixed_buffer_used_size_;
+            std::string glowable_buffer_;
+        };
+
+        inline int close_socket(socket_t sock)
+        {
+#ifdef _WIN32
+            return closesocket(sock);
 #else
-    return close(sock);
+            return close(sock);
 #endif
-}
+        }
 
-template <typename T>
-inline bool read_and_close_socket(socket_t sock, T callback)
-{
-    SocketStream strm(sock);
-    auto ret = callback(strm);
-    close_socket(sock);
-    return ret;
-}
+        inline int select_read(socket_t sock, size_t sec, size_t usec)
+        {
+            fd_set fds;
+            FD_ZERO(&fds);
+            FD_SET(sock, &fds);
 
-inline int shutdown_socket(socket_t sock)
-{
-#if defined(_MSC_VER) || defined(WIN32)
-    return shutdown(sock, SD_BOTH);
+            timeval tv;
+            tv.tv_sec = sec;
+            tv.tv_usec = usec;
+
+            return select(sock + 1, &fds, NULL, NULL, &tv);
+        }
+
+        inline bool wait_until_socket_is_ready(socket_t sock, size_t sec, size_t usec)
+        {
+            fd_set fdsr;
+            FD_ZERO(&fdsr);
+            FD_SET(sock, &fdsr);
+
+            auto fdsw = fdsr;
+            auto fdse = fdsr;
+
+            timeval tv;
+            tv.tv_sec = sec;
+            tv.tv_usec = usec;
+
+            if (select(sock + 1, &fdsr, &fdsw, &fdse, &tv) < 0) {
+                return false;
+            } else if (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw)) {
+                int error = 0;
+                socklen_t len = sizeof(error);
+                if (getsockopt(sock, SOL_SOCKET, SO_ERROR, (char*)&error, &len) < 0 || error) {
+                    return false;
+                }
+            } else {
+                return false;
+            }
+
+            return true;
+        }
+
+        template <typename T>
+        inline bool read_and_close_socket(socket_t sock, size_t keep_alive_max_count, T callback)
+        {
+            bool ret = false;
+
+            if (keep_alive_max_count > 0) {
+                auto count = keep_alive_max_count;
+                while (count > 0 &&
+                       detail::select_read(sock,
+                                           CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND,
+                                           CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0) {
+                    SocketStream strm(sock);
+                    auto last_connection = count == 1;
+                    auto connection_close = false;
+
+                    ret = callback(strm, last_connection, connection_close);
+                    if (!ret || connection_close) {
+                        break;
+                    }
+
+                    count--;
+                }
+            } else {
+                SocketStream strm(sock);
+                auto dummy_connection_close = false;
+                ret = callback(strm, true, dummy_connection_close);
+            }
+
+            close_socket(sock);
+            return ret;
+        }
+
+        inline int shutdown_socket(socket_t sock)
+        {
+#ifdef _WIN32
+            return shutdown(sock, SD_BOTH);
 #else
-    return shutdown(sock, SHUT_RDWR);
+            return shutdown(sock, SHUT_RDWR);
 #endif
-}
+        }
+
+        template <typename Fn>
+        socket_t create_socket(const char* host, int port, Fn fn, int socket_flags = 0)
+        {
+#ifdef _WIN32
+            #define SO_SYNCHRONOUS_NONALERT 0x20
+#define SO_OPENTYPE 0x7008
 
-template <typename Fn>
-socket_t create_socket(const char* host, int port, Fn fn, int socket_flags = 0)
-{
-#if defined(_MSC_VER) || defined(WIN32)
     int opt = SO_SYNCHRONOUS_NONALERT;
     setsockopt(INVALID_SOCKET, SOL_SOCKET, SO_OPENTYPE, (char*)&opt, sizeof(opt));
 #endif
 
-    // Get address info
-    struct addrinfo hints;
-    struct addrinfo *result;
+            // Get address info
+            struct addrinfo hints;
+            struct addrinfo *result;
 
-    memset(&hints, 0, sizeof(struct addrinfo));
-    hints.ai_family = AF_UNSPEC;
-    hints.ai_socktype = SOCK_STREAM;
-    hints.ai_flags = socket_flags;
-    hints.ai_protocol = 0;
+            memset(&hints, 0, sizeof(struct addrinfo));
+            hints.ai_family = AF_UNSPEC;
+            hints.ai_socktype = SOCK_STREAM;
+            hints.ai_flags = socket_flags;
+            hints.ai_protocol = 0;
 
-    auto service = std::to_string(port);
+            auto service = std::to_string(port);
 
-    if (getaddrinfo(host, service.c_str(), &hints, &result)) {
-        return -1;
-    }
-
-    for (auto rp = result; rp; rp = rp->ai_next) {
-       // Create a socket
-       auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
-       if (sock == -1) {
-          continue;
-       }
-
-       // Make 'reuse address' option available
-       int yes = 1;
-       setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&yes, sizeof(yes));
-
-       struct timeval timeout;
-       timeout.tv_sec = 10;
-       timeout.tv_usec = 0;
-
-       setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
-       setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
-
-       // bind or connect
-       if (fn(sock, *rp)) {
-          freeaddrinfo(result);
-          return sock;
-       }
-
-       close_socket(sock);
-    }
-
-    freeaddrinfo(result);
-    return -1;
-}
-
-inline socket_t create_server_socket(const char* host, int port, int socket_flags)
-{
-    return create_socket(host, port, [](socket_t sock, struct addrinfo& ai) -> socket_t {
-        if (::bind(sock, ai.ai_addr, ai.ai_addrlen)) {
-              return false;
-        }
-        if (listen(sock, 5)) { // Listen through 5 channels
-            return false;
-        }
-        return true;
-    }, socket_flags);
-}
-
-inline socket_t create_client_socket(const char* host, int port)
-{
-    return create_socket(host, port, [](socket_t sock, struct addrinfo& ai) -> socket_t {
-        if (connect(sock, ai.ai_addr, ai.ai_addrlen)) {
-            return false;
-        }
-        return true;
-    });
-}
-
-inline bool is_file(const std::string& s)
-{
-    struct stat st;
-    return stat(s.c_str(), &st) >= 0 && S_ISREG(st.st_mode);
-}
-
-inline bool is_dir(const std::string& s)
-{
-    struct stat st;
-    return stat(s.c_str(), &st) >= 0 && S_ISDIR(st.st_mode);
-}
-
-inline void read_file(const std::string& path, std::string& out)
-{
-    std::ifstream fs(path, std::ios_base::binary);
-    fs.seekg(0, std::ios_base::end);
-    auto size = fs.tellg();
-    fs.seekg(0);
-    out.resize(static_cast<size_t>(size));
-    fs.read(&out[0], size);
-}
-
-inline std::string file_extension(const std::string& path)
-{
-    std::smatch m;
-    auto pat = std::regex("\\.([a-zA-Z0-9]+)$");
-    if (std::regex_search(path, m, pat)) {
-        return m[1].str();
-    }
-    return std::string();
-}
-
-inline const char* content_type(const std::string& path)
-{
-    auto ext = detail::file_extension(path);
-    if (ext == "txt") {
-        return "text/plain";
-    } else if (ext == "html") {
-        return "text/html";
-    } else if (ext == "js") {
-        return "text/javascript";
-    } else if (ext == "css") {
-        return "text/css";
-    } else if (ext == "xml") {
-        return "text/xml";
-    } else if (ext == "jpeg" || ext == "jpg") {
-        return "image/jpg";
-    } else if (ext == "png") {
-        return "image/png";
-    } else if (ext == "gif") {
-        return "image/gif";
-    } else if (ext == "svg") {
-        return "image/svg+xml";
-    } else if (ext == "ico") {
-        return "image/x-icon";
-    } else if (ext == "json") {
-        return "application/json";
-    } else if (ext == "pdf") {
-        return "application/pdf";
-    } else if (ext == "xhtml") {
-        return "application/xhtml+xml";
-    }
-    return nullptr;
-}
-
-inline const char* status_message(int status)
-{
-    switch (status) {
-    case 200: return "OK";
-    case 400: return "Bad Request";
-    case 404: return "Not Found";
-    default:
-        case 500: return "Internal Server Error";
-    }
-}
-
-inline const char* get_header_value(const MultiMap& map, const char* key, const char* def)
-{
-    auto it = map.find(key);
-    if (it != map.end()) {
-        return it->second.c_str();
-    }
-    return def;
-}
-
-inline int get_header_value_int(const MultiMap& map, const char* key, int def)
-{
-    auto it = map.find(key);
-    if (it != map.end()) {
-        return std::stoi(it->second);
-    }
-    return def;
-}
-
-inline bool read_headers(Stream& strm, MultiMap& headers)
-{
-    static std::regex re("(.+?): (.+?)\r\n");
-
-    const auto BUFSIZ_HEADER = 2048;
-    char buf[BUFSIZ_HEADER];
-
-    for (;;) {
-        if (!socket_gets(strm, buf, BUFSIZ_HEADER)) {
-            return false;
-        }
-        if (!strcmp(buf, "\r\n")) {
-            break;
-        }
-        std::cmatch m;
-        if (std::regex_match(buf, m, re)) {
-            auto key = std::string(m[1]);
-            auto val = std::string(m[2]);
-            headers.insert(std::make_pair(key, val));
-        }
-    }
-
-    return true;
-}
-
-template <typename T>
-bool read_content(Stream& strm, T& x, bool allow_no_content_length)
-{
-    auto len = get_header_value_int(x.headers, "Content-Length", 0);
-    if (len) {
-        x.body.assign(len, 0);
-        auto r = 0;
-        while (r < len){
-            auto r_incr = strm.read(&x.body[r], len - r);
-            if (r_incr <= 0) {
-                return false;
+            if (getaddrinfo(host, service.c_str(), &hints, &result)) {
+                return INVALID_SOCKET;
             }
-            r += r_incr;
+
+            for (auto rp = result; rp; rp = rp->ai_next) {
+                // Create a socket
+                auto sock = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol);
+                if (sock == INVALID_SOCKET) {
+                    continue;
+                }
+
+                // Make 'reuse address' option available
+                int yes = 1;
+                setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char*)&yes, sizeof(yes));
+
+                // Make socket also having a timeout
+                struct timeval timeout;
+                timeout.tv_sec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND;
+                timeout.tv_usec = CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND;
+
+                setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, sizeof(timeout));
+                setsockopt (sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, sizeof(timeout));
+
+                // bind or connect
+                if (fn(sock, *rp)) {
+                    freeaddrinfo(result);
+                    return sock;
+                }
+
+                close_socket(sock);
+            }
+
+            freeaddrinfo(result);
+            return INVALID_SOCKET;
         }
-    } else if (allow_no_content_length) {
-        for (;;) {
-            char byte;
-            auto n = strm.read(&byte, 1);
-            if (n < 1) {
-                if (x.body.size() == 0) {
-                    return true; // no body
+
+        inline void set_nonblocking(socket_t sock, bool nonblocking)
+        {
+#ifdef _WIN32
+            auto flags = nonblocking ? 1UL : 0UL;
+    ioctlsocket(sock, FIONBIO, &flags);
+#else
+            auto flags = fcntl(sock, F_GETFL, 0);
+            fcntl(sock, F_SETFL, nonblocking ? (flags | O_NONBLOCK) : (flags & (~O_NONBLOCK)));
+#endif
+        }
+
+        inline bool is_connection_error()
+        {
+#ifdef _WIN32
+            return WSAGetLastError() != WSAEWOULDBLOCK;
+#else
+            return errno != EINPROGRESS;
+#endif
+        }
+
+        inline std::string get_remote_addr(socket_t sock) {
+            struct sockaddr_storage addr;
+            socklen_t len = sizeof(addr);
+
+            if (!getpeername(sock, (struct sockaddr*)&addr, &len)) {
+                char ipstr[NI_MAXHOST];
+
+                if (!getnameinfo((struct sockaddr*)&addr, len,
+                                 ipstr, sizeof(ipstr), nullptr, 0, NI_NUMERICHOST)) {
+                    return ipstr;
+                }
+            }
+
+            return std::string();
+        }
+
+        inline bool is_file(const std::string& path)
+        {
+            struct stat st;
+            return stat(path.c_str(), &st) >= 0 && S_ISREG(st.st_mode);
+        }
+
+        inline bool is_dir(const std::string& path)
+        {
+            struct stat st;
+            return stat(path.c_str(), &st) >= 0 && S_ISDIR(st.st_mode);
+        }
+
+        inline bool is_valid_path(const std::string& path) {
+            size_t level = 0;
+            size_t i = 0;
+
+            // Skip slash
+            while (i < path.size() && path[i] == '/') {
+                i++;
+            }
+
+            while (i < path.size()) {
+                // Read component
+                auto beg = i;
+                while (i < path.size() && path[i] != '/') {
+                    i++;
+                }
+
+                auto len = i - beg;
+                assert(len > 0);
+
+                if (!path.compare(beg, len, ".")) {
+                    ;
+                } else if (!path.compare(beg, len, "..")) {
+                    if (level == 0) {
+                        return false;
+                    }
+                    level--;
                 } else {
+                    level++;
+                }
+
+                // Skip slash
+                while (i < path.size() && path[i] == '/') {
+                    i++;
+                }
+            }
+
+            return true;
+        }
+
+        inline void read_file(const std::string& path, std::string& out) // loads the whole file at `path` into `out`
+        {
+            std::ifstream fs(path, std::ios_base::binary);
+            fs.seekg(0, std::ios_base::end);
+            const auto size = static_cast<std::streamoff>(fs.tellg()); // -1 when the open or the seek failed
+            fs.seekg(0);
+            out.resize(size > 0 ? static_cast<size_t>(size) : 0); // a failed tellg() must not turn into a huge resize
+            if (size > 0) { fs.read(&out[0], size); } // nothing to read for a missing or empty file
+        }
+
+        inline std::string file_extension(const std::string& path) // trailing alphanumeric extension without the dot, or ""
+        {
+            std::smatch m;
+            static const std::regex pat("\\.([a-zA-Z0-9]+)$"); // compiled once instead of on every call
+            if (std::regex_search(path, m, pat)) {
+                return m[1].str();
+            }
+            return std::string();
+        }
+
+        inline const char* find_content_type(const std::string& path) // media type for the path's extension, or nullptr if unknown
+        {
+            auto ext = file_extension(path);
+            if (ext == "txt") {
+                return "text/plain";
+            } else if (ext == "html") {
+                return "text/html";
+            } else if (ext == "css") {
+                return "text/css";
+            } else if (ext == "jpeg" || ext == "jpg") {
+                return "image/jpeg"; // the registered media type is image/jpeg; "image/jpg" is not registered
+            } else if (ext == "png") {
+                return "image/png";
+            } else if (ext == "gif") {
+                return "image/gif";
+            } else if (ext == "svg") {
+                return "image/svg+xml";
+            } else if (ext == "ico") {
+                return "image/x-icon";
+            } else if (ext == "json") {
+                return "application/json";
+            } else if (ext == "pdf") {
+                return "application/pdf";
+            } else if (ext == "js") {
+                return "application/javascript";
+            } else if (ext == "xml") {
+                return "application/xml";
+            } else if (ext == "xhtml") {
+                return "application/xhtml+xml";
+            }
+            return nullptr;
+        }
+
+        inline const char* status_message(int status)
+        {
+            switch (status) {
+                case 200: return "OK";
+                case 301: return "Moved Permanently";
+                case 302: return "Found";
+                case 303: return "See Other";
+                case 304: return "Not Modified";
+                case 400: return "Bad Request";
+                case 403: return "Forbidden";
+                case 404: return "Not Found";
+                case 415: return "Unsupported Media Type";
+                default:
+                case 500: return "Internal Server Error";
+            }
+        }
+
+        inline const char* get_header_value(const Headers& headers, const char* key, const char* def)
+        {
+            auto it = headers.find(key);
+            if (it != headers.end()) {
+                return it->second.c_str();
+            }
+            return def;
+        }
+
+        inline int get_header_value_int(const Headers& headers, const char* key, int def)
+        {
+            auto it = headers.find(key);
+            if (it != headers.end()) {
+                return std::stoi(it->second);
+            }
+            return def;
+        }
+
+        inline bool read_headers(Stream& strm, Headers& headers)
+        {
+            static std::regex re(R"((.+?):\s*(.+?)\s*\r\n)");
+
+            const auto bufsiz = 2048;
+            char buf[bufsiz];
+
+            stream_line_reader reader(strm, buf, bufsiz);
+
+            for (;;) {
+                if (!reader.getline()) {
+                    return false;
+                }
+                if (!strcmp(reader.ptr(), "\r\n")) {
                     break;
                 }
-            }
-            x.body += byte;
-        }
-    }
-    return true;
-}
-
-template <typename T>
-inline void write_headers(Stream& strm, const T& res)
-{
-    strm.write("Connection: close\r\n");
-
-    for (const auto& x: res.headers) {
-        if (x.first != "Content-Type" && x.first != "Content-Length") {
-            socket_printf(strm, "%s: %s\r\n", x.first.c_str(), x.second.c_str());
-        }
-    }
-
-    auto t = get_header_value(res.headers, "Content-Type", "text/plain");
-    socket_printf(strm, "Content-Type: %s\r\n", t);
-    socket_printf(strm, "Content-Length: %ld\r\n", res.body.size());
-    strm.write("\r\n");
-}
-
-inline void write_response(Stream& strm, const Request& req, const Response& res)
-{
-    socket_printf(strm, "HTTP/1.0 %d %s\r\n", res.status, status_message(res.status));
-
-    write_headers(strm, res);
-
-    if (!res.body.empty() && req.method != "HEAD") {
-        strm.write(res.body.c_str(), res.body.size());
-    }
-}
-
-inline std::string encode_url(const std::string& s)
-{
-    std::string result;
-
-    for (auto i = 0; s[i]; i++) {
-        switch (s[i]) {
-        case ' ':  result += "+"; break;
-        case '\'': result += "%27"; break;
-        case ',':  result += "%2C"; break;
-        case ':':  result += "%3A"; break;
-        case ';':  result += "%3B"; break;
-        default:
-            if (s[i] < 0) {
-                result += '%';
-                char hex[4];
-                size_t len = snprintf(hex, sizeof(hex), "%02X", (unsigned char)s[i]);
-                assert(len == 2);
-                result.append(hex, len);
-            } else {
-                result += s[i];
-            }
-            break;
-        }
-   }
-
-    return result;
-}
-
-inline bool is_hex(char c, int& v)
-{
-    if (0x20 <= c && isdigit(c)) {
-        v = c - '0';
-        return true;
-    } else if ('A' <= c && c <= 'F') {
-        v = c - 'A' + 10;
-        return true;
-    } else if ('a' <= c && c <= 'f') {
-        v = c - 'a' + 10;
-        return true;
-    }
-    return false;
-}
-
-inline int from_hex_to_i(const std::string& s, int i, int cnt, int& val)
-{
-    val = 0;
-    for (; s[i] && cnt; i++, cnt--) {
-        int v = 0;
-        if (is_hex(s[i], v)) {
-            val = val * 16 + v;
-        } else {
-            break;
-        }
-    }
-    return --i;
-}
-
-inline size_t to_utf8(int code, char* buff)
-{
-    if (code < 0x0080) {
-        buff[0] = (code & 0x7F);
-        return 1;
-    } else if (code < 0x0800) {
-        buff[0] = (0xC0 | ((code >> 6) & 0x1F));
-        buff[1] = (0x80 | (code & 0x3F));
-        return 2;
-    } else if (code < 0xD800) {
-        buff[0] = (0xE0 | ((code >> 12) & 0xF));
-        buff[1] = (0x80 | ((code >> 6) & 0x3F));
-        buff[2] = (0x80 | (code & 0x3F));
-        return 3;
-    } else if (code < 0xE000)  { // D800 - DFFF is invalid...
-        return 0;
-    } else if (code < 0x10000) {
-        buff[0] = (0xE0 | ((code >> 12) & 0xF));
-        buff[1] = (0x80 | ((code >> 6) & 0x3F));
-        buff[2] = (0x80 | (code & 0x3F));
-        return 3;
-    } else if (code < 0x110000) {
-        buff[0] = (0xF0 | ((code >> 18) & 0x7));
-        buff[1] = (0x80 | ((code >> 12) & 0x3F));
-        buff[2] = (0x80 | ((code >> 6) & 0x3F));
-        buff[3] = (0x80 | (code & 0x3F));
-        return 4;
-    }
-
-    // NOTREACHED
-    return 0;
-}
-
-inline std::string decode_url(const std::string& s)
-{
-    std::string result;
-
-    for (int i = 0; s[i]; i++) {
-        if (s[i] == '%') {
-            i++;
-            assert(s[i]);
-
-            if (s[i] == '%') {
-                result += s[i];
-            } else if (s[i] == 'u') {
-                // Unicode
-                i++;
-                assert(s[i]);
-
-                int val = 0;
-                i = from_hex_to_i(s, i, 4, val);
-
-                char buff[4];
-                size_t len = to_utf8(val, buff);
-
-                if (len > 0) {
-                    result.append(buff, len);
+                std::cmatch m;
+                if (std::regex_match(reader.ptr(), m, re)) {
+                    auto key = std::string(m[1]);
+                    auto val = std::string(m[2]);
+                    headers.emplace(key, val);
                 }
-            } else {
-                // HEX
-                int val = 0;
-                i = from_hex_to_i(s, i, 2, val);
-                result += val;
             }
-        } else if (s[i] == '+') {
-            result += ' ';
-        } else {
-            result += s[i];
+
+            return true;
         }
+
+        inline bool read_content_with_length(Stream& strm, std::string& out, size_t len, Progress progress)
+        {
+            out.assign(len, 0);
+            size_t r = 0;
+            while (r < len){
+                auto n = strm.read(&out[r], len - r);
+                if (n <= 0) {
+                    return false;
+                }
+
+                r += n;
+
+                if (progress) {
+                    progress(r, len);
+                }
+            }
+
+            return true;
+        }
+
+        inline bool read_content_without_length(Stream& strm, std::string& out)
+        {
+            for (;;) {
+                char byte;
+                auto n = strm.read(&byte, 1);
+                if (n < 0) {
+                    return false;
+                } else if (n == 0) {
+                    return true;
+                }
+                out += byte;
+            }
+
+            return true;
+        }
+
+        inline bool read_content_chunked(Stream& strm, std::string& out) // appends a chunked body to `out`; false on I/O or framing errors
+        {
+            const auto bufsiz = 16;
+            char buf[bufsiz];
+
+            stream_line_reader reader(strm, buf, bufsiz);
+
+            if (!reader.getline()) {
+                return false;
+            }
+
+            auto chunk_len = std::stoi(reader.ptr(), 0, 16); // chunk sizes are hexadecimal
+
+            while (chunk_len > 0){
+                std::string chunk;
+                if (!read_content_with_length(strm, chunk, chunk_len, nullptr)) {
+                    return false;
+                }
+
+                if (!reader.getline()) {
+                    return false;
+                }
+
+                if (strcmp(reader.ptr(), "\r\n")) {
+                    return false; // chunk data not followed by CRLF: malformed, do not report success
+                }
+
+                out += chunk;
+
+                if (!reader.getline()) {
+                    return false;
+                }
+
+                chunk_len = std::stoi(reader.ptr(), 0, 16);
+            }
+
+            if (chunk_len == 0) {
+                // Read the terminating empty line that follows the last (zero-sized) chunk
+                if (!reader.getline() || strcmp(reader.ptr(), "\r\n"))
+                    return false;
+            }
+
+            return true;
+        }
+
+        template <typename T> // fills x.body using Content-Length, chunked decoding, or reading until the stream ends
+        bool read_content(Stream& strm, T& x, Progress progress = Progress())
+        {
+            auto len = get_header_value_int(x.headers, "Content-Length", 0);
+
+            if (len > 0) { // a negative Content-Length must not be converted into a huge size_t
+                return read_content_with_length(strm, x.body, static_cast<size_t>(len), progress);
+            } else {
+                const auto& encoding = get_header_value(x.headers, "Transfer-Encoding", "");
+
+                if (!strcasecmp(encoding, "chunked")) {
+                    return read_content_chunked(strm, x.body);
+                } else {
+                    return read_content_without_length(strm, x.body);
+                }
+            }
+
+            return true;
+        }
+
+        template <typename T>
+        inline void write_headers(Stream& strm, const T& info)
+        {
+            for (const auto& x: info.headers) {
+                strm.write_format("%s: %s\r\n", x.first.c_str(), x.second.c_str());
+            }
+            strm.write("\r\n");
+        }
+
+        inline std::string encode_url(const std::string& s)
+        {
+            std::string result;
+
+            for (auto i = 0; s[i]; i++) {
+                switch (s[i]) {
+                    case ' ':  result += "+"; break;
+                    case '\'': result += "%27"; break;
+                    case ',':  result += "%2C"; break;
+                    case ':':  result += "%3A"; break;
+                    case ';':  result += "%3B"; break;
+                    default:
+                        if (s[i] < 0) {
+                            result += '%';
+                            char hex[4];
+                            size_t len = snprintf(hex, sizeof(hex) - 1, "%02X", (unsigned char)s[i]);
+                            assert(len == 2);
+                            result.append(hex, len);
+                        } else {
+                            result += s[i];
+                        }
+                        break;
+                }
+            }
+
+            return result;
+        }
+
+        inline bool is_hex(char c, int& v)
+        {
+            if (0x20 <= c && isdigit(c)) {
+                v = c - '0';
+                return true;
+            } else if ('A' <= c && c <= 'F') {
+                v = c - 'A' + 10;
+                return true;
+            } else if ('a' <= c && c <= 'f') {
+                v = c - 'a' + 10;
+                return true;
+            }
+            return false;
+        }
+
+        inline bool from_hex_to_i(const std::string& s, size_t i, size_t cnt, int& val)
+        {
+            if (i >= s.size()) {
+                return false;
+            }
+
+            val = 0;
+            for (; cnt; i++, cnt--) {
+                if (!s[i]) {
+                    return false;
+                }
+                int v = 0;
+                if (is_hex(s[i], v)) {
+                    val = val * 16 + v;
+                } else {
+                    return false;
+                }
+            }
+            return true;
+        }
+
+        inline size_t to_utf8(int code, char* buff)
+        {
+            if (code < 0x0080) {
+                buff[0] = (code & 0x7F);
+                return 1;
+            } else if (code < 0x0800) {
+                buff[0] = (0xC0 | ((code >> 6) & 0x1F));
+                buff[1] = (0x80 | (code & 0x3F));
+                return 2;
+            } else if (code < 0xD800) {
+                buff[0] = (0xE0 | ((code >> 12) & 0xF));
+                buff[1] = (0x80 | ((code >> 6) & 0x3F));
+                buff[2] = (0x80 | (code & 0x3F));
+                return 3;
+            } else if (code < 0xE000)  { // D800 - DFFF is invalid...
+                return 0;
+            } else if (code < 0x10000) {
+                buff[0] = (0xE0 | ((code >> 12) & 0xF));
+                buff[1] = (0x80 | ((code >> 6) & 0x3F));
+                buff[2] = (0x80 | (code & 0x3F));
+                return 3;
+            } else if (code < 0x110000) {
+                buff[0] = (0xF0 | ((code >> 18) & 0x7));
+                buff[1] = (0x80 | ((code >> 12) & 0x3F));
+                buff[2] = (0x80 | ((code >> 6) & 0x3F));
+                buff[3] = (0x80 | (code & 0x3F));
+                return 4;
+            }
+
+            // NOTREACHED
+            return 0;
+        }
+
+        inline std::string decode_url(const std::string& s)
+        {
+            std::string result;
+
+            for (size_t i = 0; i < s.size(); i++) {
+                if (s[i] == '%' && i + 1 < s.size()) {
+                    if (s[i + 1] == 'u') {
+                        int val = 0;
+                        if (from_hex_to_i(s, i + 2, 4, val)) {
+                            // 4 digits Unicode codes
+                            char buff[4];
+                            size_t len = to_utf8(val, buff);
+                            if (len > 0) {
+                                result.append(buff, len);
+                            }
+                            i += 5; // 'u0000'
+                        } else {
+                            result += s[i];
+                        }
+                    } else {
+                        int val = 0;
+                        if (from_hex_to_i(s, i + 1, 2, val)) {
+                            // 2 digits hex codes
+                            result += val;
+                            i += 2; // '00'
+                        } else {
+                            result += s[i];
+                        }
+                    }
+                } else if (s[i] == '+') {
+                    result += ' ';
+                } else {
+                    result += s[i];
+                }
+            }
+
+            return result;
+        }
+
+        inline void parse_query_text(const std::string& s, Params& params)
+        {
+            split(&s[0], &s[s.size()], '&', [&](const char* b, const char* e) {
+                std::string key;
+                std::string val;
+                split(b, e, '=', [&](const char* b, const char* e) {
+                    if (key.empty()) {
+                        key.assign(b, e);
+                    } else {
+                        val.assign(b, e);
+                    }
+                });
+                params.emplace(key, decode_url(val));
+            });
+        }
+
+        inline bool parse_multipart_boundary(const std::string& content_type, std::string& boundary)
+        {
+            auto pos = content_type.find("boundary=");
+            if (pos == std::string::npos) {
+                return false;
+            }
+
+            boundary = content_type.substr(pos + 9);
+            return true;
+        }
+
+        inline bool parse_multipart_formdata(
+                const std::string& boundary, const std::string& body, MultipartFiles& files)
+        {
+            static std::string dash = "--";
+            static std::string crlf = "\r\n";
+
+            static std::regex re_content_type(
+                    "Content-Type: (.*?)", std::regex_constants::icase);
+
+            static std::regex re_content_disposition(
+                    "Content-Disposition: form-data; name=\"(.*?)\"(?:; filename=\"(.*?)\")?",
+                    std::regex_constants::icase);
+
+            auto dash_boundary = dash + boundary;
+
+            auto pos = body.find(dash_boundary);
+            if (pos != 0) {
+                return false;
+            }
+
+            pos += dash_boundary.size();
+
+            auto next_pos = body.find(crlf, pos);
+            if (next_pos == std::string::npos) {
+                return false;
+            }
+
+            pos = next_pos + crlf.size();
+
+            while (pos < body.size()) {
+                next_pos = body.find(crlf, pos);
+                if (next_pos == std::string::npos) {
+                    return false;
+                }
+
+                std::string name;
+                MultipartFile file;
+
+                auto header = body.substr(pos, (next_pos - pos));
+
+                while (pos != next_pos) {
+                    std::smatch m;
+                    if (std::regex_match(header, m, re_content_type)) {
+                        file.content_type = m[1];
+                    } else if (std::regex_match(header, m, re_content_disposition)) {
+                        name = m[1];
+                        file.filename = m[2];
+                    }
+
+                    pos = next_pos + crlf.size();
+
+                    next_pos = body.find(crlf, pos);
+                    if (next_pos == std::string::npos) {
+                        return false;
+                    }
+
+                    header = body.substr(pos, (next_pos - pos));
+                }
+
+                pos = next_pos + crlf.size();
+
+                next_pos = body.find(crlf + dash_boundary, pos);
+
+                if (next_pos == std::string::npos) {
+                    return false;
+                }
+
+                file.offset = pos;
+                file.length = next_pos - pos;
+
+                pos = next_pos + crlf.size() + dash_boundary.size();
+
+                next_pos = body.find(crlf, pos);
+                if (next_pos == std::string::npos) {
+                    return false;
+                }
+
+                files.emplace(name, file);
+
+                pos = next_pos + crlf.size();
+            }
+
+            return true;
+        }
+
+        inline std::string to_lower(const char* beg, const char* end) // lower-cased copy of the range [beg, end)
+        {
+            std::string out;
+            auto it = beg;
+            while (it != end) {
+                out += static_cast<char>(::tolower(static_cast<unsigned char>(*it))); // tolower() on a negative char is undefined
+                it++;
+            }
+            return out;
+        }
+
+        inline void make_range_header_core(std::string&) {}
+
+        template<typename uint64_t>
+        inline void make_range_header_core(std::string& field, uint64_t value)
+        {
+            if (!field.empty()) {
+                field += ", ";
+            }
+            field += std::to_string(value) + "-";
+        }
+
+        template<typename uint64_t, typename... Args>
+        inline void make_range_header_core(std::string& field, uint64_t value1, uint64_t value2, Args... args)
+        {
+            if (!field.empty()) {
+                field += ", ";
+            }
+            field += std::to_string(value1) + "-" + std::to_string(value2);
+            make_range_header_core(field, args...);
+        }
+
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+        inline bool can_compress(const std::string& content_type) {
+    return !content_type.find("text/") ||
+        content_type == "image/svg+xml" ||
+        content_type == "application/javascript" ||
+        content_type == "application/json" ||
+        content_type == "application/xml" ||
+        content_type == "application/xhtml+xml";
+}
+
+inline void compress(std::string& content)
+{
+    z_stream strm;
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
+
+    auto ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED, 31, 8, Z_DEFAULT_STRATEGY);
+    if (ret != Z_OK) {
+        return;
     }
 
-    return result;
+    strm.avail_in = content.size();
+    strm.next_in = (Bytef *)content.data();
+
+    std::string compressed;
+
+    const auto bufsiz = 16384;
+    char buff[bufsiz];
+    do {
+        strm.avail_out = bufsiz;
+        strm.next_out = (Bytef *)buff;
+        deflate(&strm, Z_FINISH);
+        compressed.append(buff, bufsiz - strm.avail_out);
+    } while (strm.avail_out == 0);
+
+    content.swap(compressed);
+
+    deflateEnd(&strm);
 }
 
-inline void write_request(Stream& strm, const Request& req)
+inline void decompress(std::string& content)
 {
-    auto path = encode_url(req.path);
-    socket_printf(strm, "%s %s HTTP/1.0\r\n", req.method.c_str(), path.c_str());
+    z_stream strm;
+    strm.zalloc = Z_NULL;
+    strm.zfree = Z_NULL;
+    strm.opaque = Z_NULL;
 
-    write_headers(strm, req);
-
-    if (!req.body.empty()) {
-        if (req.has_header("application/x-www-form-urlencoded")) {
-            auto str = encode_url(req.body);
-            strm.write(str.c_str(), str.size());
-        } else {
-            strm.write(req.body.c_str(), req.body.size());
-        }
+    // 15 is the value of wbits, which should be at the maximum possible value to ensure
+    // that any gzip stream can be decoded. The offset of 16 specifies that the stream
+    // to decompress will be formatted with a gzip wrapper.
+    auto ret = inflateInit2(&strm, 16 + 15);
+    if (ret != Z_OK) {
+        return;
     }
-}
 
-inline void parse_query_text(const std::string& s, Map& params)
-{
-    split(&s[0], &s[s.size()], '&', [&](const char* b, const char* e) {
-        std::string key;
-        std::string val;
-        split(b, e, '=', [&](const char* b, const char* e) {
-            if (key.empty()) {
-                key.assign(b, e);
-            } else {
-                val.assign(b, e);
-            }
-        });
-        params[key] = detail::decode_url(val);
-    });
-}
+    strm.avail_in = content.size();
+    strm.next_in = (Bytef *)content.data();
 
-#ifdef _MSC_VER
-class WSInit {
+    std::string decompressed;
+
+    const auto bufsiz = 16384;
+    char buff[bufsiz];
+    do {
+        strm.avail_out = bufsiz;
+        strm.next_out = (Bytef *)buff;
+        inflate(&strm, Z_NO_FLUSH);
+        decompressed.append(buff, bufsiz - strm.avail_out);
+    } while (strm.avail_out == 0);
+
+    content.swap(decompressed);
+
+    inflateEnd(&strm);
+}
+#endif
+
+#ifdef _WIN32
+        class WSInit {
 public:
     WSInit() {
         WSADATA wsaData;
@@ -758,557 +1280,1073 @@ public:
 static WSInit wsinit_;
 #endif
 
-} // namespace detail
+    } // namespace detail
+
+// Header utilities
+    template<typename uint64_t, typename... Args>
+    inline std::pair<std::string, std::string> make_range_header(uint64_t value, Args... args)
+    {
+        std::string field;
+        detail::make_range_header_core(field, value, args...);
+        field.insert(0, "bytes=");
+        return std::make_pair("Range", field);
+    }
 
 // Request implementation
-inline bool Request::has_header(const char* key) const
-{
-    return headers.find(key) != headers.end();
-}
+    inline bool Request::has_header(const char* key) const
+    {
+        return headers.find(key) != headers.end();
+    }
 
-inline std::string Request::get_header_value(const char* key) const
-{
-    return detail::get_header_value(headers, key, "");
-}
+    inline std::string Request::get_header_value(const char* key) const
+    {
+        return detail::get_header_value(headers, key, "");
+    }
 
-inline void Request::set_header(const char* key, const char* val)
-{
-    headers.insert(std::make_pair(key, val));
-}
+    inline void Request::set_header(const char* key, const char* val)
+    {
+        headers.emplace(key, val);
+    }
 
-inline bool Request::has_param(const char* key) const
-{
-    return params.find(key) != params.end();
-}
+    inline bool Request::has_param(const char* key) const
+    {
+        return params.find(key) != params.end();
+    }
+
+    inline std::string Request::get_param_value(const char* key) const
+    {
+        // Returns the value found in params under key,
+        // or an empty string when no entry with that key exists.
+        const auto hit = params.find(key);
+        const bool missing = (hit == params.end());
+        return missing ? std::string() : hit->second;
+    }
+
+    inline bool Request::has_file(const char* key) const
+    {
+        return files.find(key) != files.end();
+    }
+
+    inline MultipartFile Request::get_file_value(const char* key) const
+    {
+        // Returns the entry found in files under key,
+        // or a default-constructed MultipartFile when no such entry exists.
+        const auto hit = files.find(key);
+        const bool missing = (hit == files.end());
+        return missing ? MultipartFile() : hit->second;
+    }
 
 // Response implementation
-inline bool Response::has_header(const char* key) const
-{
-    return headers.find(key) != headers.end();
-}
+    inline bool Response::has_header(const char* key) const
+    {
+        return headers.find(key) != headers.end();
+    }
 
-inline std::string Response::get_header_value(const char* key) const
-{
-    return detail::get_header_value(headers, key, "");
-}
+    inline std::string Response::get_header_value(const char* key) const
+    {
+        return detail::get_header_value(headers, key, "");
+    }
 
-inline void Response::set_header(const char* key, const char* val)
-{
-    headers.insert(std::make_pair(key, val));
-}
+    inline void Response::set_header(const char* key, const char* val)
+    {
+        headers.emplace(key, val);
+    }
 
-inline void Response::set_redirect(const char* url)
-{
-    set_header("Location", url);
-    status = 302;
-}
+    inline void Response::set_redirect(const char* url)
+    {
+        set_header("Location", url);
+        status = 302;
+    }
 
-inline void Response::set_content(const char* s, size_t n, const char* content_type)
-{
-    body.assign(s, n);
-    set_header("Content-Type", content_type);
-}
+    inline void Response::set_content(const char* s, size_t n, const char* content_type)
+    {
+        body.assign(s, n);
+        set_header("Content-Type", content_type);
+    }
 
-inline void Response::set_content(const std::string& s, const char* content_type)
-{
-    body = s;
-    set_header("Content-Type", content_type);
-}
+    inline void Response::set_content(const std::string& s, const char* content_type)
+    {
+        body = s;
+        set_header("Content-Type", content_type);
+    }
+
+// Stream implementation
+    template <typename ...Args>
+    inline void Stream::write_format(const char* fmt, const Args& ...args)
+    {
+        // Format into a fixed stack buffer first; switch to a growing heap buffer if it does not fit.
+        const auto stack_size = 2048;
+        char stack_buf[stack_size];
+#if defined(_MSC_VER) && _MSC_VER < 1900
+        auto len = _snprintf_s(stack_buf, stack_size, stack_size - 1, fmt, args...);
+#else
+        auto len = snprintf(stack_buf, stack_size - 1, fmt, args...);
+#endif
+        if (len > 0) {
+            if (len < stack_size - 1) {
+                write(stack_buf, len);
+            } else {
+                // The reported length does not fit: keep doubling the heap buffer and re-formatting.
+                std::vector<char> heap_buf(stack_size);
+                while (len >= static_cast<int>(heap_buf.size() - 1)) {
+                    heap_buf.resize(heap_buf.size() * 2);
+#if defined(_MSC_VER) && _MSC_VER < 1900
+                    len = _snprintf_s(&heap_buf[0], heap_buf.size(), heap_buf.size() - 1, fmt, args...);
+#else
+                    len = snprintf(&heap_buf[0], heap_buf.size() - 1, fmt, args...);
+#endif
+                }
+                write(&heap_buf[0], len);
+            }
+        }
+    }
 
 // Socket stream implementation
-inline SocketStream::SocketStream(socket_t sock): sock_(sock)
-{
-}
+    inline SocketStream::SocketStream(socket_t sock): sock_(sock)
+    {
+    }
 
-inline SocketStream::~SocketStream()
-{
-}
+    inline SocketStream::~SocketStream()
+    {
+    }
 
-inline int SocketStream::read(char* ptr, size_t size)
-{
-    return recv(sock_, ptr, size, 0);
-}
+    inline int SocketStream::read(char* ptr, size_t size)
+    {
+        return recv(sock_, ptr, size, 0);
+    }
 
-inline int SocketStream::write(const char* ptr, size_t size)
-{
-    return send(sock_, ptr, size, 0);
-}
+    inline int SocketStream::write(const char* ptr, size_t size)
+    {
+        return send(sock_, ptr, size, 0);
+    }
 
-inline int SocketStream::write(const char* ptr)
-{
-    return write(ptr, strlen(ptr));
-}
+    inline int SocketStream::write(const char* ptr)
+    {
+        return write(ptr, strlen(ptr));
+    }
+
+    inline std::string SocketStream::get_remote_addr() {
+        return detail::get_remote_addr(sock_);
+    }
 
 // HTTP server implementation
-inline Server::Server()
-    : svr_sock_(-1)
-{
-#if !defined(_MSC_VER) && !defined(WIN32)
-    signal(SIGPIPE, SIG_IGN);
+    inline Server::Server()
+            : keep_alive_max_count_(5)
+            , is_running_(false)
+            , svr_sock_(INVALID_SOCKET)
+            , running_threads_(0)
+    {
+#ifndef _WIN32
+        signal(SIGPIPE, SIG_IGN);
 #endif
-}
-
-inline Server::~Server()
-{
-}
-
-inline void Server::get(const char* pattern, Handler handler)
-{
-    get_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
-}
-
-inline void Server::post(const char* pattern, Handler handler)
-{
-    post_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
-}
-
-inline bool Server::set_base_dir(const char* path)
-{
-    if (detail::is_dir(path)) {
-        base_dir_ = path;
-        return true;
     }
-    return false;
-}
 
-inline void Server::set_error_handler(Handler handler)
-{
-    error_handler_ = handler;
-}
+    inline Server::~Server()
+    {
+    }
 
-inline void Server::set_logger(Logger logger)
-{
-    logger_ = logger;
-}
+    inline Server& Server::Get(const char* pattern, Handler handler)
+    {
+        get_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+        return *this;
+    }
 
-inline bool Server::listen(const char* host, int port, int socket_flags)
-{
-    svr_sock_ = detail::create_server_socket(host, port, socket_flags);
-    if (svr_sock_ == -1) {
+    inline Server& Server::Post(const char* pattern, Handler handler)
+    {
+        post_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+        return *this;
+    }
+
+    inline Server& Server::Put(const char* pattern, Handler handler)
+    {
+        put_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+        return *this;
+    }
+
+    inline Server& Server::Delete(const char* pattern, Handler handler)
+    {
+        delete_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+        return *this;
+    }
+
+    inline Server& Server::Options(const char* pattern, Handler handler)
+    {
+        options_handlers_.push_back(std::make_pair(std::regex(pattern), handler));
+        return *this;
+    }
+
+    inline bool Server::set_base_dir(const char* path)
+    {
+        if (detail::is_dir(path)) {
+            base_dir_ = path;
+            return true;
+        }
         return false;
     }
 
-    auto ret = true;
+    inline void Server::set_error_handler(Handler handler)
+    {
+        error_handler_ = handler;
+    }
 
-    for (;;) {
-        socket_t sock = accept(svr_sock_, NULL, NULL);
+    inline void Server::set_logger(Logger logger)
+    {
+        logger_ = logger;
+    }
 
-        if (sock == -1) {
-            if (svr_sock_ != -1) {
-                detail::close_socket(svr_sock_);
-                ret = false;
+    inline void Server::set_keep_alive_max_count(size_t count)
+    {
+        keep_alive_max_count_ = count;
+    }
+
+    inline int Server::bind_to_any_port(const char* host, int socket_flags)
+    {
+        return bind_internal(host, 0, socket_flags);
+    }
+
+    inline bool Server::listen_after_bind() {
+        return listen_internal();
+    }
+
+    inline bool Server::listen(const char* host, int port, int socket_flags)
+    {
+        if (bind_internal(host, port, socket_flags) < 0)
+            return false;
+        return listen_internal();
+    }
+
+    inline bool Server::is_running() const
+    {
+        return is_running_;
+    }
+
+    inline void Server::stop()
+    {
+        if (is_running_) {
+            assert(svr_sock_ != INVALID_SOCKET);
+            detail::shutdown_socket(svr_sock_);
+            detail::close_socket(svr_sock_);
+            svr_sock_ = INVALID_SOCKET;
+        }
+    }
+
+    inline bool Server::parse_request_line(const char* s, Request& req)
+    {
+        // Fills req from the request line s; returns false when s is not a supported request line.
+        static std::regex re("(GET|HEAD|POST|PUT|DELETE|OPTIONS) (([^?]+)(?:\\?(.+?))?) (HTTP/1\\.[01])\r\n");
+        std::cmatch m;
+        if (std::regex_match(s, m, re)) {
+            // Groups: 1 method, 2 target (path + query), 3 path, 4 query (optional), 5 version.
+            req.version = std::string(m[5]);
+            req.method = std::string(m[1]);
+            req.target = std::string(m[2]);
+            req.path = detail::decode_url(m[3]);
+
+            // Parse query text (group 4 is empty when the target has no '?')
+            auto len = std::distance(m[4].first, m[4].second);
+            if (len > 0) {
+                detail::parse_query_text(m[4], req.params);
+            }
+            return true;
+        }
+
+        return false;
+    }
+
+    inline void Server::write_response(Stream& strm, bool last_connection, const Request& req, Response& res)
+    {
+        assert(res.status != -1);
+        // Writes the status line, headers and body of res to strm, then calls the logger if one is set.
+        if (400 <= res.status && error_handler_) {
+            error_handler_(req, res);
+        }
+
+        // Response line
+        strm.write_format("HTTP/1.1 %d %s\r\n",
+                          res.status,
+                          detail::status_message(res.status));
+
+        // Headers: add "Connection: close" when last_connection is set, the request is HTTP/1.0, or it asked for close
+        if (last_connection ||
+            req.version == "HTTP/1.0" ||
+            req.get_header_value("Connection") == "close") {
+            res.set_header("Connection", "close");
+        }
+
+        if (!res.body.empty()) {
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+            // TODO: 'Accept-Encoding' has gzip, not gzip;q=0
+        const auto& encodings = req.get_header_value("Accept-Encoding");
+        if (encodings.find("gzip") != std::string::npos &&
+            detail::can_compress(res.get_header_value("Content-Type"))) {
+            detail::compress(res.body);
+            res.set_header("Content-Encoding", "gzip");
+        }
+#endif
+            // Default the content type to text/plain when the handler did not set one.
+            if (!res.has_header("Content-Type")) {
+                res.set_header("Content-Type", "text/plain");
+            }
+            // NOTE(review): Content-Length is only set for a non-empty body; confirm keep-alive clients cope.
+            auto length = std::to_string(res.body.size());
+            res.set_header("Content-Length", length.c_str());
+        }
+
+        detail::write_headers(strm, res);
+
+        // Body (not written for HEAD requests)
+        if (!res.body.empty() && req.method != "HEAD") {
+            strm.write(res.body.c_str(), res.body.size());
+        }
+
+        // Log
+        if (logger_) {
+            logger_(req, res);
+        }
+    }
+
+    inline bool Server::handle_file_request(Request& req, Response& res)
+    {
+        // Files are served only when a base directory is set and the request path is valid.
+        if (base_dir_.empty() || !detail::is_valid_path(req.path)) {
+            return false;
+        }
+        std::string file_path = base_dir_ + req.path;
+        if (!file_path.empty() && file_path.back() == '/') {
+            file_path += "index.html"; // a path ending in '/' maps to its index.html
+        }
+        if (!detail::is_file(file_path)) {
+            return false;
+        }
+
+        detail::read_file(file_path, res.body);
+        auto mime = detail::find_content_type(file_path);
+        if (mime) {
+            res.set_header("Content-Type", mime);
+        }
+        res.status = 200;
+        return true;
+    }
+
+    inline socket_t Server::create_server_socket(const char* host, int port, int socket_flags) const
+    {
+        return detail::create_socket(host, port,
+                                     [](socket_t sock, struct addrinfo& ai) -> bool {
+                                         if (::bind(sock, ai.ai_addr, ai.ai_addrlen)) {
+                                             return false;
+                                         }
+                                         if (::listen(sock, 5)) { // Listen through 5 channels
+                                             return false;
+                                         }
+                                         return true;
+                                     }, socket_flags);
+    }
+
+    inline int Server::bind_internal(const char* host, int port, int socket_flags)
+    {
+        if (!is_valid()) {
+            return -1;
+        }
+
+        svr_sock_ = create_server_socket(host, port, socket_flags);
+        if (svr_sock_ == INVALID_SOCKET) {
+            return -1;
+        }
+
+        if (port == 0) {
+            struct sockaddr_storage address;
+            socklen_t len = sizeof(address);
+            if (getsockname(svr_sock_, reinterpret_cast<struct sockaddr *>(&address), &len) == -1) {
+                return -1;
+            }
+            if (address.ss_family == AF_INET) {
+                return ntohs(reinterpret_cast<struct sockaddr_in*>(&address)->sin_port);
+            } else if (address.ss_family == AF_INET6) {
+                return ntohs(reinterpret_cast<struct sockaddr_in6*>(&address)->sin6_port);
             } else {
-                ; // The server socket was closed by user.
+                return -1;
             }
-            break;
+        } else {
+            return port;
         }
-
-        // TODO: should be async
-        read_and_close_socket(sock);
     }
 
-    return ret;
-}
+    inline bool Server::listen_internal()
+    {
+        auto ret = true;
 
-inline void Server::stop()
-{
-    detail::shutdown_socket(svr_sock_);
-    detail::close_socket(svr_sock_);
-    svr_sock_ = -1;
-}
+        is_running_ = true;
 
-inline bool Server::read_request_line(Stream& strm, Request& req)
-{
-    const auto BUFSIZ_REQUESTLINE = 2048;
-    char buf[BUFSIZ_REQUESTLINE];
-    if (!detail::socket_gets(strm, buf, BUFSIZ_REQUESTLINE)) {
-        return false;
-    }
+        for (;;) {
+            auto val = detail::select_read(svr_sock_, 0, 100000);
 
-    static std::regex re("(GET|HEAD|POST) ([^?]+)(?:\\?(.+?))? HTTP/1\\.[01]\r\n");
-
-    std::cmatch m;
-    if (std::regex_match(buf, m, re)) {
-        req.method = std::string(m[1]);
-        req.path = detail::decode_url(m[2]);
-
-        // Parse query text
-        auto len = std::distance(m[3].first, m[3].second);
-        if (len > 0) {
-            detail::parse_query_text(m[3], req.params);
-        }
-
-        return true;
-    }
-
-    return false;
-}
-
-inline bool Server::handle_file_request(Request& req, Response& res)
-{
-    if (!base_dir_.empty()) {
-        std::string path = base_dir_ + req.path;
-
-        if (!path.empty() && path.back() == '/') {
-            path += "index.html";
-        }
-
-        if (detail::is_file(path)) {
-            detail::read_file(path, res.body);
-            auto type = detail::content_type(path);
-            if (type) {
-                res.set_header("Content-Type", type);
+            if (val == 0) { // Timeout
+                if (svr_sock_ == INVALID_SOCKET) {
+                    // The server socket was closed by 'stop' method.
+                    break;
+                }
+                continue;
             }
-            res.status = 200;
+
+            socket_t sock = accept(svr_sock_, NULL, NULL);
+
+            if (sock == INVALID_SOCKET) {
+                if (svr_sock_ != INVALID_SOCKET) {
+                    detail::close_socket(svr_sock_);
+                    ret = false;
+                } else {
+                    ; // The server socket was closed by user.
+                }
+                break;
+            }
+
+            // TODO: Use thread pool...
+            std::thread([=]() {
+                {
+                    std::lock_guard<std::mutex> guard(running_threads_mutex_);
+                    running_threads_++;
+                }
+
+                read_and_close_socket(sock);
+
+                {
+                    std::lock_guard<std::mutex> guard(running_threads_mutex_);
+                    running_threads_--;
+                }
+            }).detach();
+        }
+
+        // TODO: Use thread pool...
+        for (;;) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            std::lock_guard<std::mutex> guard(running_threads_mutex_);
+            if (!running_threads_) {
+                break;
+            }
+        }
+
+        is_running_ = false;
+
+        return ret;
+    }
+
+    inline bool Server::routing(Request& req, Response& res)
+    {
+        if (req.method == "GET" && handle_file_request(req, res)) {
             return true;
         }
-    }
 
-    return false;
-}
-
-inline bool Server::routing(Request& req, Response& res)
-{
-    if (req.method == "GET" && handle_file_request(req, res)) {
-        return true;
-    }
-
-    if (req.method == "GET" || req.method == "HEAD") {
-        return dispatch_request(req, res, get_handlers_);
-    } else if (req.method == "POST") {
-        return dispatch_request(req, res, post_handlers_);
-    }
-    return false;
-}
-
-inline bool Server::dispatch_request(Request& req, Response& res, Handlers& handlers)
-{
-    for (const auto& x: handlers) {
-        const auto& pattern = x.first;
-        const auto& handler = x.second;
-
-        if (std::regex_match(req.path, req.matches, pattern)) {
-            handler(req, res);
-            return true;
+        if (req.method == "GET" || req.method == "HEAD") {
+            return dispatch_request(req, res, get_handlers_);
+        } else if (req.method == "POST") {
+            return dispatch_request(req, res, post_handlers_);
+        } else if (req.method == "PUT") {
+            return dispatch_request(req, res, put_handlers_);
+        } else if (req.method == "DELETE") {
+            return dispatch_request(req, res, delete_handlers_);
+        } else if (req.method == "OPTIONS") {
+            return dispatch_request(req, res, options_handlers_);
         }
-    }
-    return false;
-}
-
-inline void Server::process_request(Stream& strm)
-{
-    Request req;
-    Response res;
-
-    if (!read_request_line(strm, req) ||
-        !detail::read_headers(strm, req.headers)) {
-        // TODO:
-        return;
-    }
-
-    if (req.method == "POST") {
-        if (!detail::read_content(strm, req, false)) {
-            // TODO:
-            return;
-        }
-        static std::string type = "application/x-www-form-urlencoded";
-        if (!req.get_header_value("Content-Type").compare(0, type.size(), type)) {
-            detail::parse_query_text(req.body, req.params);
-        }
-    }
-
-    if (routing(req, res)) {
-        if (res.status == -1) {
-            res.status = 200;
-        }
-    } else {
-        res.status = 404;
-    }
-    assert(res.status != -1);
-
-    if (400 <= res.status && error_handler_) {
-        error_handler_(req, res);
-    }
-
-    detail::write_response(strm, req, res);
-
-    if (logger_) {
-        logger_(req, res);
-    }
-}
-
-inline bool Server::read_and_close_socket(socket_t sock)
-{
-    return detail::read_and_close_socket(sock, [this](Stream& strm) {
-        process_request(strm);
-        return true;
-    });
-}
-
-// HTTP client implementation
-inline Client::Client(const char* host, int port)
-    : host_(host)
-    , port_(port)
-    , host_and_port_(host_ + ":" + std::to_string(port_))
-{
-}
-
-inline Client::~Client()
-{
-}
-
-inline bool Client::read_response_line(Stream& strm, Response& res)
-{
-    const auto BUFSIZ_RESPONSELINE = 2048;
-    char buf[BUFSIZ_RESPONSELINE];
-    if (!detail::socket_gets(strm, buf, BUFSIZ_RESPONSELINE)) {
         return false;
     }
 
-    const static std::regex re("HTTP/1\\.[01] (\\d+?) .+\r\n");
+    inline bool Server::dispatch_request(Request& req, Response& res, Handlers& handlers)
+    {
+        for (const auto& x: handlers) {
+            const auto& pattern = x.first;
+            const auto& handler = x.second;
 
-    std::cmatch m;
-    if (std::regex_match(buf, m, re)) {
-        res.status = std::stoi(std::string(m[1]));
-    }
-
-    return true;
-}
-
-inline bool Client::send(const Request& req, Response& res)
-{
-    auto sock = detail::create_client_socket(host_.c_str(), port_);
-    if (sock == -1) {
+            if (std::regex_match(req.path, req.matches, pattern)) {
+                handler(req, res);
+                return true;
+            }
+        }
         return false;
     }
 
-    return read_and_close_socket(sock, req, res);
-}
+    inline bool Server::process_request(Stream& strm, bool last_connection, bool& connection_close)
+    {
+        const auto bufsiz = 2048;
+        char buf[bufsiz];
 
-inline bool Client::process_request(Stream& strm, const Request& req, Response& res)
-{
-    // Send request
-    detail::write_request(strm, req);
+        detail::stream_line_reader reader(strm, buf, bufsiz);
 
-    // Receive response
-    if (!read_response_line(strm, res) ||
-        !detail::read_headers(strm, res.headers)) {
-        return false;
-    }
-    if (req.method != "HEAD") {
-        if (!detail::read_content(strm, res, true)) {
+        // Connection has been closed on client
+        if (!reader.getline()) {
             return false;
         }
-    }
 
-    return true;
-}
+        Request req;
+        Response res;
 
-inline bool Client::read_and_close_socket(socket_t sock, const Request& req, Response& res)
-{
-    return detail::read_and_close_socket(sock, [&](Stream& strm) {
-        return process_request(strm, req, res);
-    });
-}
+        res.version = "HTTP/1.1";
 
-inline void Client::add_default_headers(Request& req)
-{
-    req.set_header("Host", host_and_port_.c_str());
-    req.set_header("Accept", "*/*");
-    req.set_header("User-Agent", "cpp-httplib/0.1");
-}
-
-inline std::shared_ptr<Response> Client::get(const char* path)
-{
-    Request req;
-    req.method = "GET";
-    req.path = path;
-    add_default_headers(req);
-
-    auto res = std::make_shared<Response>();
-
-    return send(req, *res) ? res : nullptr;
-}
-
-inline std::shared_ptr<Response> Client::head(const char* path)
-{
-    Request req;
-    req.method = "HEAD";
-    req.path = path;
-    add_default_headers(req);
-
-    auto res = std::make_shared<Response>();
-
-    return send(req, *res) ? res : nullptr;
-}
-
-inline std::shared_ptr<Response> Client::post(
-    const char* path, const std::string& body, const char* content_type)
-{
-    Request req;
-    req.method = "POST";
-    req.path = path;
-    add_default_headers(req);
-
-    req.set_header("Content-Type", content_type);
-    req.body = body;
-
-    auto res = std::make_shared<Response>();
-
-    return send(req, *res) ? res : nullptr;
-}
-
-inline std::shared_ptr<Response> Client::post(
-    const char* path, const Map& params)
-{
-    std::string query;
-    for (auto it = params.begin(); it != params.end(); ++it) {
-        if (it != params.begin()) {
-            query += "&";
+        // Request line and headers
+        if (!parse_request_line(reader.ptr(), req) || !detail::read_headers(strm, req.headers)) {
+            res.status = 400;
+            write_response(strm, last_connection, req, res);
+            return true;
         }
-        query += it->first;
-        query += "=";
-        query += it->second;
+
+        auto ret = true;
+        if (req.get_header_value("Connection") == "close") {
+            // ret = false;
+            connection_close = true;
+        }
+
+        req.set_header("REMOTE_ADDR", strm.get_remote_addr().c_str());
+
+        // Body
+        if (req.method == "POST" || req.method == "PUT") {
+            if (!detail::read_content(strm, req)) {
+                res.status = 400;
+                write_response(strm, last_connection, req, res);
+                return ret;
+            }
+
+            const auto& content_type = req.get_header_value("Content-Type");
+
+            if (req.get_header_value("Content-Encoding") == "gzip") {
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+                detail::decompress(req.body);
+#else
+                res.status = 415;
+                write_response(strm, last_connection, req, res);
+                return ret;
+#endif
+            }
+
+            if (!content_type.find("application/x-www-form-urlencoded")) {
+                detail::parse_query_text(req.body, req.params);
+            } else if(!content_type.find("multipart/form-data")) {
+                std::string boundary;
+                if (!detail::parse_multipart_boundary(content_type, boundary) ||
+                    !detail::parse_multipart_formdata(boundary, req.body, req.files)) {
+                    res.status = 400;
+                    write_response(strm, last_connection, req, res);
+                    return ret;
+                }
+            }
+        }
+
+        if (routing(req, res)) {
+            if (res.status == -1) {
+                res.status = 200;
+            }
+        } else {
+            res.status = 404;
+        }
+
+        write_response(strm, last_connection, req, res);
+        return ret;
     }
 
-    return post(path, query, "application/x-www-form-urlencoded");
-}
+    inline bool Server::is_valid() const
+    {
+        return true;
+    }
+
+    inline bool Server::read_and_close_socket(socket_t sock)
+    {
+        return detail::read_and_close_socket(
+                sock,
+                keep_alive_max_count_,
+                [this](Stream& strm, bool last_connection, bool& connection_close) {
+                    return process_request(strm, last_connection, connection_close);
+                });
+    }
+
+// HTTP client implementation
+    inline Client::Client(
+            const char* host, int port, size_t timeout_sec)
+            : host_(host)
+            , port_(port)
+            , timeout_sec_(timeout_sec)
+            , host_and_port_(host_ + ":" + std::to_string(port_))
+    {
+    }
+
+    inline Client::~Client()
+    {
+    }
+
+    inline bool Client::is_valid() const
+    {
+        return true;
+    }
+
+    inline socket_t Client::create_client_socket() const
+    {
+        return detail::create_socket(host_.c_str(), port_,
+                                     [=](socket_t sock, struct addrinfo& ai) -> bool {
+                                         detail::set_nonblocking(sock, true);
+
+                                         auto ret = connect(sock, ai.ai_addr, ai.ai_addrlen);
+                                         if (ret < 0) {
+                                             if (detail::is_connection_error() ||
+                                                 !detail::wait_until_socket_is_ready(sock, timeout_sec_, 0)) {
+                                                 detail::close_socket(sock);
+                                                 return false;
+                                             }
+                                         }
+
+                                         detail::set_nonblocking(sock, false);
+                                         return true;
+                                     });
+    }
+
+    inline bool Client::read_response_line(Stream& strm, Response& res)
+    {
+        // Reads the status line from strm and stores its HTTP version and status code in res.
+        // Returns false when no line can be read or the line is not a valid status line.
+        const auto bufsiz = 2048;
+        char buf[bufsiz];
+        detail::stream_line_reader reader(strm, buf, bufsiz);
+        if (!reader.getline()) {
+            return false;
+        }
+        // Exactly three status digits (so std::stoi cannot overflow on hostile input);
+        // the reason phrase after the second space may be empty.
+        const static std::regex re("(HTTP/1\\.[01]) (\\d{3}) .*\r\n");
+        std::cmatch m;
+        if (!std::regex_match(reader.ptr(), m, re)) {
+            return false; // not a status line: fail instead of leaving res.status unset
+        }
+        res.version = std::string(m[1]);
+        res.status = std::stoi(std::string(m[2]));
+        return true;
+    }
+
+    inline bool Client::send(Request& req, Response& res)
+    {
+        if (req.path.empty()) {
+            return false;
+        }
+
+        auto sock = create_client_socket();
+        if (sock == INVALID_SOCKET) {
+            return false;
+        }
+
+        return read_and_close_socket(sock, req, res);
+    }
+
+    inline void Client::write_request(Stream& strm, Request& req)
+    {
+        auto path = detail::encode_url(req.path);
+
+        // Request line
+        strm.write_format("%s %s HTTP/1.1\r\n",
+                          req.method.c_str(),
+                          path.c_str());
+
+        // Headers
+        req.set_header("Host", host_and_port_.c_str());
+
+        if (!req.has_header("Accept")) {
+            req.set_header("Accept", "*/*");
+        }
+
+        if (!req.has_header("User-Agent")) {
+            req.set_header("User-Agent", "cpp-httplib/0.2");
+        }
+
+        // TODO: Support KeepAlive connection
+        // if (!req.has_header("Connection")) {
+        req.set_header("Connection", "close");
+        // }
+
+        if (!req.body.empty()) {
+            if (!req.has_header("Content-Type")) {
+                req.set_header("Content-Type", "text/plain");
+            }
+
+            auto length = std::to_string(req.body.size());
+            req.set_header("Content-Length", length.c_str());
+        }
+
+        detail::write_headers(strm, req);
+
+        // Body
+        if (!req.body.empty()) {
+            if (req.get_header_value("Content-Type") == "application/x-www-form-urlencoded") {
+                auto str = detail::encode_url(req.body);
+                strm.write(str.c_str(), str.size());
+            } else {
+                strm.write(req.body.c_str(), req.body.size());
+            }
+        }
+    }
+
+    inline bool Client::process_request(Stream& strm, Request& req, Response& res, bool& connection_close)
+    {
+        // Send request (one request/response exchange per call; returns false if the response cannot be read)
+        write_request(strm, req);
+
+        // Receive response and headers
+        if (!read_response_line(strm, res) || !detail::read_headers(strm, res.headers)) {
+            return false;
+        }
+        // Flag the connection for closing when the response says "Connection: close" or is HTTP/1.0.
+        if (res.get_header_value("Connection") == "close" || res.version == "HTTP/1.0") {
+            connection_close = true;
+        }
+
+        // Body (not read for HEAD requests)
+        if (req.method != "HEAD") {
+            if (!detail::read_content(strm, res, req.progress)) {
+                return false;
+            }
+            // A gzip-encoded body is decompressed in place; without zlib support the exchange fails.
+            if (res.get_header_value("Content-Encoding") == "gzip") {
+#ifdef CPPHTTPLIB_ZLIB_SUPPORT
+                detail::decompress(res.body);
+#else
+                return false;
+#endif
+            }
+        }
+
+        return true;
+    }
+
+    // Delegates to detail::read_and_close_socket with 0 as second argument and a callback running process_request.
+    inline bool Client::read_and_close_socket(socket_t sock, Request& req, Response& res)
+    {
+        auto exchange = [&](Stream& strm, bool /*last_connection*/, bool& connection_close) {
+            return process_request(strm, req, res, connection_close);
+        };
+
+        return detail::read_and_close_socket(sock, 0, exchange);
+    }
+
+    inline std::shared_ptr<Response> Client::Get(const char* path, Progress progress)
+    {
+        return Get(path, Headers(), progress); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds a GET request for `path` with `headers` and `progress`; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Get(const char* path, const Headers& headers, Progress progress)
+    {
+        Request request;
+        request.method = "GET";
+        request.path = path;
+        request.headers = headers;
+        request.progress = progress;
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
+
+    inline std::shared_ptr<Response> Client::Head(const char* path)
+    {
+        return Head(path, Headers()); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds a HEAD request for `path` with `headers`; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Head(const char* path, const Headers& headers)
+    {
+        Request request;
+        request.method = "HEAD";
+        request.headers = headers;
+        request.path = path;
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
+
+    inline std::shared_ptr<Response> Client::Post(
+            const char* path, const std::string& body, const char* content_type)
+    {
+        return Post(path, Headers(), body, content_type); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds a POST carrying `body`, adding a Content-Type header; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Post(
+            const char* path, const Headers& headers, const std::string& body, const char* content_type)
+    {
+        Request request;
+        request.method = "POST";
+        request.headers = headers;
+        request.path = path;
+        request.headers.emplace("Content-Type", content_type); // added after the caller's headers were copied in
+        request.body = body;
+
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
+
+    inline std::shared_ptr<Response> Client::Post(const char* path, const Params& params)
+    {
+        return Post(path, Headers(), params); // convenience overload: forwards with an empty header set
+    }
+
+    // Joins `params` into a "key=value&key=value" body (appended as-is, no escaping here) and posts it as a form.
+    inline std::shared_ptr<Response> Client::Post(const char* path, const Headers& headers, const Params& params)
+    {
+        std::string form_body;
+        const char* separator = "";
+        for (const auto& field : params) {
+            form_body += separator;
+            form_body += field.first;
+            form_body += "=";
+            form_body += field.second;
+            separator = "&";
+        }
+        return Post(path, headers, form_body, "application/x-www-form-urlencoded");
+    }
+
+    inline std::shared_ptr<Response> Client::Put(
+            const char* path, const std::string& body, const char* content_type)
+    {
+        return Put(path, Headers(), body, content_type); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds a PUT carrying `body`, adding a Content-Type header; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Put(
+            const char* path, const Headers& headers, const std::string& body, const char* content_type)
+    {
+        Request request;
+        request.method = "PUT";
+        request.headers = headers;
+        request.path = path;
+        request.headers.emplace("Content-Type", content_type); // added after the caller's headers were copied in
+        request.body = body;
+
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
+
+    inline std::shared_ptr<Response> Client::Delete(const char* path)
+    {
+        return Delete(path, Headers()); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds a DELETE request for `path` with `headers`; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Delete(const char* path, const Headers& headers)
+    {
+        Request request;
+        request.method = "DELETE";
+        request.path = path;
+        request.headers = headers;
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
+
+    inline std::shared_ptr<Response> Client::Options(const char* path)
+    {
+        return Options(path, Headers()); // convenience overload: forwards with an empty header set
+    }
+
+    // Builds an OPTIONS request for `path` with `headers`; returns the response, or nullptr if send() fails.
+    inline std::shared_ptr<Response> Client::Options(const char* path, const Headers& headers)
+    {
+        Request request;
+        request.method = "OPTIONS";
+        request.path = path;
+        request.headers = headers;
+        auto response = std::make_shared<Response>();
+        const bool delivered = send(request, *response);
+        return delivered ? response : nullptr;
+    }
 
 /*
  * SSL Implementation
  */
 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-namespace detail {
+    namespace detail {
 
-template <typename U, typename V, typename T>
-inline bool read_and_close_socket_ssl(socket_t sock, SSL_CTX* ctx, U SSL_connect_or_accept, V setup, T callback)
-{
-    auto ssl = SSL_new(ctx);
+        // Wraps sock in a TLS session, runs the handshake, hands the stream to callback (repeatedly while keep-alive allows), then frees the session and closes sock. Returns the last callback result, or false when session creation or the handshake failed.
+        template <typename U, typename V, typename T>
+        inline bool read_and_close_socket_ssl(
+                socket_t sock, size_t keep_alive_max_count,
+                // TODO: OpenSSL 1.0.2 occasionally crashes...
+                // The upcoming 1.1.0 is going to be thread safe.
+                SSL_CTX* ctx, std::mutex& ctx_mutex,
+                U SSL_connect_or_accept, V setup,
+                T callback)
+        {
+            SSL* ssl = nullptr;
+            {
+                std::lock_guard<std::mutex> guard(ctx_mutex);
 
-    auto bio = BIO_new_socket(sock, BIO_NOCLOSE);
-    SSL_set_bio(ssl, bio, bio);
+                ssl = SSL_new(ctx);
+                if (!ssl) {
+                    close_socket(sock); // every other exit path closes the socket; do not leak it here
+                    return false;
+                }
+            }
 
-    setup(ssl);
+            auto bio = BIO_new_socket(sock, BIO_NOCLOSE);
+            SSL_set_bio(ssl, bio, bio);
 
-    SSL_connect_or_accept(ssl);
+            setup(ssl);
 
-    SSLSocketStream strm(ssl);
-    auto ret = callback(strm);
+            const bool handshake_ok = SSL_connect_or_accept(ssl) == 1; // SSL_connect/SSL_accept return 1 only for a completed handshake
 
-    SSL_shutdown(ssl);
-    SSL_free(ssl);
-    close_socket(sock);
-    return ret;
-}
+            bool ret = false;
 
-class SSLInit {
-public:
-    SSLInit() {
-        SSL_load_error_strings();
-        SSL_library_init();
-    }
-};
+            if (keep_alive_max_count > 0) {
+                auto count = keep_alive_max_count;
+                while (handshake_ok && count > 0 &&
+                       detail::select_read(sock,
+                                           CPPHTTPLIB_KEEPALIVE_TIMEOUT_SECOND,
+                                           CPPHTTPLIB_KEEPALIVE_TIMEOUT_USECOND) > 0) {
+                    SSLSocketStream strm(sock, ssl);
+                    auto last_connection = count == 1;
+                    auto connection_close = false;
 
-static SSLInit sslinit_;
+                    ret = callback(strm, last_connection, connection_close);
+                    if (!ret || connection_close) {
+                        break;
+                    }
 
-} // namespace detail
+                    count--;
+                }
+            } else if (handshake_ok) {
+                SSLSocketStream strm(sock, ssl);
+                auto dummy_connection_close = false;
+                ret = callback(strm, true, dummy_connection_close);
+            }
+
+            if (handshake_ok) { SSL_shutdown(ssl); } // do not send close_notify on a session whose handshake failed
+
+            {
+                std::lock_guard<std::mutex> guard(ctx_mutex);
+                SSL_free(ssl);
+            }
+            close_socket(sock);
+            return ret;
+        }
+
+        class SSLInit {
+        public:
+            SSLInit() {
+                SSL_load_error_strings();
+                SSL_library_init();
+            }
+        };
+
+        static SSLInit sslinit_;
+
+    } // namespace detail
 
 // SSL socket stream implementation
-inline SSLSocketStream::SSLSocketStream(SSL* ssl): ssl_(ssl)
-{
-}
+    inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL* ssl)
+            : sock_(sock), ssl_(ssl)
+    {
+    }
 
-inline SSLSocketStream::~SSLSocketStream()
-{
-}
+    inline SSLSocketStream::~SSLSocketStream()
+    {
+    }
 
-inline int SSLSocketStream::read(char* ptr, size_t size)
-{
-    return SSL_read(ssl_, ptr, size);
-}
+    inline int SSLSocketStream::read(char* ptr, size_t size)
+    {
+        return SSL_read(ssl_, ptr, size);
+    }
 
-inline int SSLSocketStream::write(const char* ptr, size_t size)
-{
-    return SSL_write(ssl_, ptr, size);
-}
+    inline int SSLSocketStream::write(const char* ptr, size_t size)
+    {
+        return SSL_write(ssl_, ptr, size);
+    }
 
-inline int SSLSocketStream::write(const char* ptr)
-{
-    return write(ptr, strlen(ptr));
-}
+    inline int SSLSocketStream::write(const char* ptr)
+    {
+        return write(ptr, strlen(ptr));
+    }
+
+    inline std::string SSLSocketStream::get_remote_addr() {
+        return detail::get_remote_addr(sock_);
+    }
 
 // SSL HTTP server implementation
-inline SSLServer::SSLServer(const char* cert_path, const char* private_key_path)
-{
-    ctx_ = SSL_CTX_new(SSLv23_server_method());
+    inline SSLServer::SSLServer(const char* cert_path, const char* private_key_path)
+    {
+        ctx_ = SSL_CTX_new(SSLv23_server_method());
 
-    if (ctx_) {
-        SSL_CTX_set_options(ctx_,
-                            SSL_OP_ALL | SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
-                            SSL_OP_NO_COMPRESSION |
-                            SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
+        if (ctx_) {
+            SSL_CTX_set_options(ctx_,
+                                SSL_OP_ALL | SSL_OP_NO_SSLv2 | SSL_OP_NO_SSLv3 |
+                                SSL_OP_NO_COMPRESSION |
+                                SSL_OP_NO_SESSION_RESUMPTION_ON_RENEGOTIATION);
 
-        // auto ecdh = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
-        // SSL_CTX_set_tmp_ecdh(ctx_, ecdh);
-        // EC_KEY_free(ecdh);
+            // auto ecdh = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1);
+            // SSL_CTX_set_tmp_ecdh(ctx_, ecdh);
+            // EC_KEY_free(ecdh);
 
-        if (SSL_CTX_use_certificate_file(ctx_, cert_path, SSL_FILETYPE_PEM) != 1 ||
-            SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != 1) {
-            SSL_CTX_free(ctx_);
-            ctx_ = nullptr;
+            if (SSL_CTX_use_certificate_file(ctx_, cert_path, SSL_FILETYPE_PEM) != 1 ||
+                SSL_CTX_use_PrivateKey_file(ctx_, private_key_path, SSL_FILETYPE_PEM) != 1) {
+                SSL_CTX_free(ctx_);
+                ctx_ = nullptr;
+            }
         }
     }
-}
 
-inline SSLServer::~SSLServer()
-{
-    if (ctx_) {
-        SSL_CTX_free(ctx_);
+    inline SSLServer::~SSLServer()
+    {
+        if (ctx_) {
+            SSL_CTX_free(ctx_);
+        }
     }
-}
 
-inline bool SSLServer::read_and_close_socket(socket_t sock)
-{
-    return detail::read_and_close_socket_ssl(
-        sock, ctx_,
-        SSL_accept,
-        [](SSL* ssl) {},
-        [this](Stream& strm) {
-            process_request(strm);
-            return true;
-        });
-}
-
-// SSL HTTP client implementation
-inline SSLClient::SSLClient(const char* host, int port)
-    : Client(host, port)
-{
-    ctx_ = SSL_CTX_new(SSLv23_client_method());
-}
-
-inline SSLClient::~SSLClient()
-{
-    if (ctx_) {
-        SSL_CTX_free(ctx_);
+    inline bool SSLServer::is_valid() const
+    {
+        return ctx_;
     }
-}
 
-inline bool SSLClient::read_and_close_socket(socket_t sock, const Request& req, Response& res)
-{
-    return detail::read_and_close_socket_ssl(
-        sock, ctx_,
-        SSL_connect,
-        [&](SSL* ssl) {
-            SSL_set_tlsext_host_name(ssl, host_.c_str());
-        },
-        [&](Stream& strm) {
-            return process_request(strm, req, res);
-        });
-}
+    inline bool SSLServer::read_and_close_socket(socket_t sock)
+    {
+        return detail::read_and_close_socket_ssl(
+                sock,
+                keep_alive_max_count_,
+                ctx_, ctx_mutex_,
+                SSL_accept,
+                [](SSL* /*ssl*/) {},
+                [this](Stream& strm, bool last_connection, bool& connection_close) {
+                    return process_request(strm, last_connection, connection_close);
+                });
+    }
+
+    // SSL HTTP client implementation
+    inline SSLClient::SSLClient(const char* host, int port, size_t timeout_sec)
+            : Client(host, port, timeout_sec)
+    {
+        ctx_ = SSL_CTX_new(SSLv23_client_method());
+    }
+
+    inline SSLClient::~SSLClient()
+    {
+        if (ctx_) {
+            SSL_CTX_free(ctx_);
+        }
+    }
+
+    inline bool SSLClient::is_valid() const
+    {
+        return ctx_;
+    }
+
+    inline bool SSLClient::read_and_close_socket(socket_t sock, Request& req, Response& res)
+    {
+        return is_valid() && detail::read_and_close_socket_ssl(
+                sock, 0,
+                ctx_, ctx_mutex_,
+                SSL_connect,
+                [&](SSL* ssl) {
+                    SSL_set_tlsext_host_name(ssl, host_.c_str());
+                },
+                [&](Stream& strm, bool /*last_connection*/, bool& connection_close) {
+                    return process_request(strm, req, res, connection_close);
+                });
+    }
 #endif
 
 } // namespace httplib
 
 #endif
 
-// vim: et ts=4 sw=4 cin cino={1s ff=unix
+// vim: et ts=4 sw=4 cin cino={1s ff=unix
\ No newline at end of file
diff --git a/src/App.cpp b/src/App.cpp
index c004b180..d3330dc2 100644
--- a/src/App.cpp
+++ b/src/App.cpp
@@ -113,7 +113,6 @@ App::~App()
     delete m_network;
 
     Options::release();
-    Mem::release();
     Platform::release();
 
     uv_tty_reset_mode();
@@ -142,12 +141,26 @@ int App::start()
 
     background();
 
-    if (!CryptoNight::init(m_options->algo(), m_options->aesni())) {
-        LOG_ERR("\"%s\" hash self-test failed.", m_options->algoName());
-        return EINVAL;
+    if (Options::i()->colors()) {
+        LOG_INFO(WHITE_BOLD("%s hash self-test"), m_options->algoName());
+    }
+    else {
+        LOG_INFO("%s hash self-test", m_options->algoName());
     }
 
-    Mem::allocate(m_options);
+    if (!CryptoNight::init(m_options->algo(), m_options->aesni())) {
+        LOG_ERR("%s hash self-test... failed.", m_options->algoName());
+        return EINVAL;
+    } else {
+        if (Options::i()->colors()) {
+            LOG_INFO(WHITE_BOLD("%s hash self-test... ") GREEN_BOLD("successful") ".", m_options->algoName());
+        }
+        else {
+            LOG_INFO("%s hash self-test... successful.", m_options->algoName());
+        }
+    }
+
+    Mem::init(m_options);
 
     Summary::print();
 
@@ -174,7 +187,7 @@ int App::start()
     }
 #   endif
 
-    Workers::start(m_options->affinity(), m_options->priority());
+    Workers::start(m_options->threads(), m_options->affinity(), m_options->priority());
 
     if (m_options->pools().front()->isValid()) {
         m_network->connect();
diff --git a/src/App_unix.cpp b/src/App_unix.cpp
index df90eb26..8511597c 100644
--- a/src/App_unix.cpp
+++ b/src/App_unix.cpp
@@ -31,10 +31,6 @@
 
 void App::background()
 {
-    if (m_options->affinity() != -1L) {
-        Cpu::setAffinity(-1, m_options->affinity());
-    }
-
     if (m_options->background()) {
         Log::i()->text(Options::i()->colors()
                        ? "\x1B[01;31m\nBackground mode is not supported by %s on *nix Systems. Please use screen/tmux or systemd service instead.\n"
diff --git a/src/App_win.cpp b/src/App_win.cpp
index 895f3bdf..6b2716f0 100644
--- a/src/App_win.cpp
+++ b/src/App_win.cpp
@@ -33,10 +33,6 @@
 
 void App::background()
 {
-    if (m_options->affinity() != -1L) {
-        Cpu::setAffinity(-1, m_options->affinity());
-    }
-
     if (!m_options->background()) {
         return;
     }
diff --git a/src/Cpu.cpp b/src/Cpu.cpp
index ddd4642d..73fcdfb4 100644
--- a/src/Cpu.cpp
+++ b/src/Cpu.cpp
@@ -145,9 +145,9 @@ void Cpu::optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options::
     CpuImpl::instance().optimizeParameters(threadsCount, hashFactor, algo, maxCpuUsage, safeMode);
 }
 
-void Cpu::setAffinity(int id, uint64_t mask)
+int Cpu::setThreadAffinity(size_t threadId, int64_t affinityMask)
 {
-    CpuImpl::instance().setAffinity(id, mask);
+    return CpuImpl::instance().setThreadAffinity(threadId, affinityMask);
 }
 
 bool Cpu::hasAES()
@@ -194,3 +194,24 @@ size_t Cpu::availableCache()
 {
     return CpuImpl::instance().availableCache();
 }
+
+int Cpu::getAssignedCpuId(size_t threadId, int64_t affinityMask)
+{
+    // Returns the index of the threadId-th set bit of affinityMask (counting from bit 0), or -1 if there is none.
+    const Mem::ThreadBitSet threadAffinityMask(affinityMask);
+    // bitset::test() throws std::out_of_range beyond the bitset width, so never scan past it on very wide machines.
+    const size_t reportedThreads = CpuImpl::instance().threads();
+    const size_t cpuLimit = reportedThreads < threadAffinityMask.size() ? reportedThreads : threadAffinityMask.size();
+    size_t setBitsSeen = 0;
+
+    for (size_t i = 0; i < cpuLimit; i++) {
+        if (!threadAffinityMask.test(i)) {
+            continue;
+        }
+        if (setBitsSeen == threadId) {
+            return static_cast<int>(i);
+        }
+        setBitsSeen++;
+    }
+    return -1;
+}
diff --git a/src/Cpu.h b/src/Cpu.h
index 90bf3e18..a9161d67 100644
--- a/src/Cpu.h
+++ b/src/Cpu.h
@@ -42,7 +42,7 @@ public:
     static void optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options::Algo algo,
                                     size_t maxCpuUsage, bool safeMode);
 
-    static void setAffinity(int id, uint64_t mask);
+    static int setThreadAffinity(size_t threadId, int64_t affinityMask);
 
     static bool hasAES();
     static bool isX64();
@@ -53,6 +53,7 @@ public:
     static size_t sockets();
     static size_t threads();
     static size_t availableCache();
+    static int getAssignedCpuId(size_t threadId, int64_t affinityMask);
 };
 
 
diff --git a/src/CpuImpl.h b/src/CpuImpl.h
index 96d3ad4e..b2bec265 100644
--- a/src/CpuImpl.h
+++ b/src/CpuImpl.h
@@ -29,6 +29,7 @@
 #include <vector>
 
 #include "Options.h"
+#include "Mem.h"
 
 class CpuImpl
 {
@@ -39,7 +40,7 @@ public:
 
     void optimizeParameters(size_t& threadsCount, size_t& hashFactor, Options::Algo algo,
                             size_t maxCpuUsage, bool safeMode);
-    void setAffinity(int id, uint64_t mask);
+    int setThreadAffinity(size_t threadId, int64_t affinityMask);
 
     bool hasAES();
     bool isX64();
diff --git a/src/Cpu_mac.cpp b/src/Cpu_mac.cpp
index f82a8924..6bb787e1 100644
--- a/src/Cpu_mac.cpp
+++ b/src/Cpu_mac.cpp
@@ -22,13 +22,15 @@
  */
 
 
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
 #include <pthread.h>
 #include <sched.h>
 #include <unistd.h>
 
 
-#include "Cpu.h"
 #include "CpuImpl.h"
+#include "Cpu.h"
 
 void CpuImpl::init()
 {
@@ -39,7 +41,23 @@ void CpuImpl::init()
     initCommon();
 }
 
-
-void CpuImpl::setAffinity(int id, uint64_t mask)
+int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask)
 {
+    // Applies a Mach affinity policy to the calling thread; returns the CPU index used as its tag, or -1 if none was chosen or the call failed.
+    int cpuId = -1;
+    if (affinityMask != -1L) {
+        cpuId = Cpu::getAssignedCpuId(threadId, affinityMask);
+    } else {
+        cpuId = static_cast<int>(threadId);
+    }
+
+    if (cpuId > -1) {
+        thread_affinity_policy_data_t policy = {static_cast<integer_t>(cpuId)};
+        const thread_port_t mach_thread = pthread_mach_thread_np(pthread_self());
+        if (thread_policy_set(mach_thread, THREAD_AFFINITY_POLICY, reinterpret_cast<thread_policy_t>(&policy), THREAD_AFFINITY_POLICY_COUNT) != KERN_SUCCESS) {
+            cpuId = -1; // report a rejected policy the same way the unix and Windows implementations report failure
+        }
+    }
+
+    return cpuId;
 }
diff --git a/src/Cpu_unix.cpp b/src/Cpu_unix.cpp
index 191bc38e..ea83f56f 100644
--- a/src/Cpu_unix.cpp
+++ b/src/Cpu_unix.cpp
@@ -35,9 +35,8 @@
 #include <unistd.h>
 #include <string.h>
 
-
 #include "CpuImpl.h"
-
+#include "Cpu.h"
 
 #ifdef __FreeBSD__
 typedef cpuset_t cpu_set_t;
@@ -54,26 +53,31 @@ void CpuImpl::init()
 }
 
 
-void CpuImpl::setAffinity(int id, uint64_t mask)
+int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask)
 {
-    cpu_set_t set;
-    CPU_ZERO(&set);
+    int cpuId = -1;
 
-    for (size_t i = 0; i < threads(); i++) {
-        if (mask & (1UL << i)) {
-            CPU_SET(i, &set);
-        }
-    }
-
-    if (id == -1) {
-#       ifndef __FreeBSD__
-        sched_setaffinity(0, sizeof(&set), &set);
-#       endif
+    if (affinityMask != -1L) {
+        cpuId = Cpu::getAssignedCpuId(threadId, affinityMask);
     } else {
-#       ifndef __ANDROID__
-        pthread_setaffinity_np(pthread_self(), sizeof(&set), &set);
-#       else
-        sched_setaffinity(gettid(), sizeof(&set), &set);
-#       endif
+        cpuId = static_cast<int>(threadId);
     }
+
+    if (cpuId > -1) {
+        cpu_set_t mn;
+        CPU_ZERO(&mn);
+        CPU_SET(cpuId, &mn);
+
+#   ifndef __ANDROID__
+        if (pthread_setaffinity_np(pthread_self(), sizeof(cpu_set_t), &mn) != 0) {
+            cpuId = -1;
+        }
+#   else
+        if (sched_setaffinity(gettid(), sizeof(cpu_set_t), &mn) == -1) {
+            cpuId = -1;
+        }
+#   endif
+    }
+
+    return cpuId;
 }
diff --git a/src/Cpu_win.cpp b/src/Cpu_win.cpp
index 1560dd64..2a7e74d2 100644
--- a/src/Cpu_win.cpp
+++ b/src/Cpu_win.cpp
@@ -28,6 +28,7 @@
 
 #include "CpuImpl.h"
 #include "Mem.h"
+#include "Cpu.h"
 
 void CpuImpl::init()
 {
@@ -42,24 +43,29 @@ void CpuImpl::init()
 }
 
 
-void CpuImpl::setAffinity(int id, uint64_t mask)
+int CpuImpl::setThreadAffinity(size_t threadId, int64_t affinityMask)
 {
-    if (id == -1) {
-        SetProcessAffinityMask(GetCurrentProcess(), mask);
+    int cpuId = -1; // -1 is the "no CPU chosen / pinning failed" result handed back to the caller
+    // Pins the calling thread to a single logical CPU and returns that CPU's index.
+    if (affinityMask != -1L) { // explicit mask given: take the CPU of its threadId-th set bit
+        cpuId = Cpu::getAssignedCpuId(threadId, affinityMask);
     } else {
-        Mem::ThreadBitSet threadAffinityMask = Mem::ThreadBitSet(mask);
-
-        int threadCount = 0;
-
-        for (size_t i = 0; i < m_totalThreads; i++) {
-            if (threadAffinityMask.test(i)) {
-                if (threadCount == id) {
-                    SetThreadAffinityMask(GetCurrentThread(), 1ULL << i);
-                    break;
-                }
-
-                threadCount++;
-            }
+        if (threadId+1 > Cpu::threads()/2) { // auto mode, upper half of the threads: odd CPUs, 2*(threadId - threads/2) + 1
+            cpuId = (threadId - Cpu::threads()/2) + (threadId+1 - Cpu::threads()/2);
+        } else { // auto mode, lower half: even CPUs 0, 2, 4, ... (presumably one per physical core first - TODO confirm)
+            cpuId = threadId * 2;
         }
     }
+
+    if (cpuId >= 64) { // the mask below is built with a 64-bit shift, so larger indices cannot be expressed
+        cpuId = -1;
+    }
+
+    if (cpuId > -1) {
+        if (SetThreadAffinityMask(GetCurrentThread(), 1ULL << cpuId) == 0) { // a zero return is treated as failure
+            cpuId = -1;
+        }
+    }
+
+    return cpuId;
 }
diff --git a/src/Mem.cpp b/src/Mem.cpp
index 522deab4..09cef4f2 100644
--- a/src/Mem.cpp
+++ b/src/Mem.cpp
@@ -24,25 +24,20 @@
 
 #include <memory.h>
 
-
 #include "crypto/CryptoNight.h"
 #include "Mem.h"
 
-
-int Mem::m_algo          = 0;
-int Mem::m_flags         = 0;
+bool Mem::m_useHugePages = true;
 size_t Mem::m_hashFactor = 1;
-size_t Mem::m_threads    = 0;
-size_t Mem::m_memorySize = 0;
-alignas(16) uint8_t *Mem::m_memory = nullptr;
+int Mem::m_flags         = 0;
+Options::Algo Mem::m_algo = Options::ALGO_CRYPTONIGHT;
 Mem::ThreadBitSet Mem::m_multiHashThreadMask = Mem::ThreadBitSet(-1L);
 
-cryptonight_ctx *Mem::create(int threadId)
+ScratchPadMem Mem::create(ScratchPad** scratchPads, int threadId)
 {
     size_t scratchPadSize;
 
-    switch (m_algo)
-    {
+    switch (m_algo) {
         case Options::ALGO_CRYPTONIGHT_LITE:
             scratchPadSize = MEMORY_LITE;
             break;
@@ -55,17 +50,29 @@ cryptonight_ctx *Mem::create(int threadId)
             break;
     }
 
-    size_t offset = 0;
-    for (int i=0; i < threadId; i++) {
-        offset += sizeof(cryptonight_ctx);
-        offset += scratchPadSize * getThreadHashFactor(i);
+    ScratchPadMem scratchPadMem;
+    scratchPadMem.realSize = scratchPadSize * getThreadHashFactor(threadId); // bytes the hashes of this thread actually use
+    scratchPadMem.size = scratchPadSize * getThreadHashFactor(threadId);
+    scratchPadMem.size = ((scratchPadMem.size + MEMORY - 1) / MEMORY) * MEMORY; // true round-up to whole MEMORY-sized pages; "size += size % MEMORY" was only a multiple when the remainder was 0 or MEMORY/2
+    scratchPadMem.pages = scratchPadMem.size / MEMORY;
+
+    allocate(scratchPadMem, m_useHugePages);
+    // One ScratchPad header per hash of this thread, each pointing at its own slice of the buffer allocated above.
+    for (size_t i = 0; i < getThreadHashFactor(threadId); ++i) {
+        ScratchPad* scratchPad = static_cast<ScratchPad *>(_mm_malloc(sizeof(ScratchPad), 4096));
+        scratchPad->memory     = scratchPadMem.memory + (i * scratchPadSize);
+
+        scratchPads[i] = scratchPad;
     }
 
-    auto* ctx = reinterpret_cast<cryptonight_ctx *>(&m_memory[offset]);
-
-    size_t memOffset = offset+sizeof(cryptonight_ctx);
-
-    ctx->memory = &m_memory[memOffset];
-
-    return ctx;
+    return scratchPadMem;
 }
+
+void Mem::release(ScratchPad** scratchPads, ScratchPadMem& scratchPadMem, int threadId)
+{
+    release(scratchPadMem); // the scratchpad buffer goes first, then the per-hash ScratchPad headers
+    const size_t padCount = getThreadHashFactor(threadId);
+    for (size_t idx = 0; idx != padCount; ++idx) {
+        _mm_free(scratchPads[idx]);
+    }
+}
\ No newline at end of file
diff --git a/src/Mem.h b/src/Mem.h
index e2048162..032d6662 100644
--- a/src/Mem.h
+++ b/src/Mem.h
@@ -33,22 +33,47 @@
 
 #include "Options.h"
 
-struct cryptonight_ctx;
+#ifdef _WIN32
+#   ifdef __GNUC__
+#       include <mm_malloc.h>
+#   else
+#       include <malloc.h>
+#   endif
+#else
+#   if defined(XMRIG_ARM) && !defined(__clang__)
+#       include "aligned_malloc.h"
+#   else
+#       include <mm_malloc.h>
+#   endif
+#endif
+
+struct ScratchPad;
+
+struct ScratchPadMem
+{
+    alignas(16) uint8_t *memory; // start of the buffer obtained by Mem::allocate()
+    // Bookkeeping for one thread's scratchpad allocation; filled in by Mem::create() and Mem::allocate().
+    size_t hugePages; // 0 unless the huge-page mapping succeeded, in which case it equals `pages`
+    size_t pages;     // `size` divided by MEMORY
+    size_t size;      // number of bytes requested from the allocator (padded up from realSize)
+    size_t realSize;  // scratchpad size times the thread's hash factor, before padding
+};
 
 
 class Mem
 {
 public:
     typedef std::bitset<128> ThreadBitSet;
+
     enum Flags {
         HugepagesAvailable = 1,
         HugepagesEnabled   = 2,
         Lock               = 4
     };
 
-    static bool allocate(const Options* options);
-    static cryptonight_ctx *create(int threadId);
-    static void release();
+    static void init(const Options* option);
+    static ScratchPadMem create(ScratchPad** scratchPads, int threadId);
+    static void release(ScratchPad** scratchPads, ScratchPadMem& scratchPadMem, int threadId);
 
     static inline size_t hashFactor()         { return m_hashFactor; }
     static inline size_t getThreadHashFactor(int threadId)
@@ -56,19 +81,19 @@ public:
         return (m_multiHashThreadMask.all() ||
                 m_multiHashThreadMask.test(threadId)) ? m_hashFactor : 1;
     }
+
     static inline bool isHugepagesAvailable() { return (m_flags & HugepagesAvailable) != 0; }
-    static inline bool isHugepagesEnabled()   { return (m_flags & HugepagesEnabled) != 0; }
-    static inline int flags()                 { return m_flags; }
-    static inline size_t threads()            { return m_threads; }
 
 private:
+    static void allocate(ScratchPadMem& scratchPadMem, bool useHugePages);
+    static void release(ScratchPadMem& scratchPadMem);
+
+private:
+    static bool m_useHugePages;
     static size_t m_hashFactor;
-    static size_t m_threads;
-    static int m_algo;
     static int m_flags;
+    static Options::Algo m_algo;
     static ThreadBitSet m_multiHashThreadMask;
-    static size_t m_memorySize;
-    alignas(16) static uint8_t *m_memory;
 };
 
 
diff --git a/src/Mem_unix.cpp b/src/Mem_unix.cpp
index b52c449b..48fd56f3 100644
--- a/src/Mem_unix.cpp
+++ b/src/Mem_unix.cpp
@@ -25,92 +25,61 @@
 #include <cstdlib>
 #include <sys/mman.h>
 
-
-#if defined(XMRIG_ARM) && !defined(__clang__)
-#   include "aligned_malloc.h"
-#else
-#   include <mm_malloc.h>
-#endif
-
-
 #include "crypto/CryptoNight.h"
 #include "log/Log.h"
 #include "Mem.h"
 
 
-bool Mem::allocate(const Options* options)
+void Mem::init(const Options* options)
 {
-    m_algo       = options->algo();
-    m_threads    = options->threads();
     m_hashFactor = options->hashFactor();
-    m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask());
-    m_memorySize = 0;
+    m_useHugePages = options->hugePages();
+    m_algo = options->algo();
+    m_multiHashThreadMask = Mem::ThreadBitSet(static_cast<unsigned long long int>(options->multiHashThreadMask()));
+}
 
-    size_t scratchPadSize;
-    switch (m_algo)
-    {
-        case Options::ALGO_CRYPTONIGHT_LITE:
-            scratchPadSize = MEMORY_LITE;
-            break;
-        case Options::ALGO_CRYPTONIGHT_HEAVY:
-            scratchPadSize = MEMORY_HEAVY;
-            break;
-        case Options::ALGO_CRYPTONIGHT:
-        default:
-            scratchPadSize = MEMORY;
-            break;
+void Mem::allocate(ScratchPadMem& scratchPadMem, bool useHugePages)
+{
+    scratchPadMem.hugePages = 0;
+
+    if (!useHugePages) {
+        scratchPadMem.memory = static_cast<uint8_t*>(_mm_malloc(scratchPadMem.size, 4096));
+        return;
     }
 
-    for (size_t i=0; i < m_threads; i++) {
-        m_memorySize += sizeof(cryptonight_ctx);
-        m_memorySize += scratchPadSize * getThreadHashFactor(i);
-    }
-
-    m_memorySize = m_memorySize - (m_memorySize % MEMORY) + MEMORY;
-
-    if (!options->hugePages()) {
-        m_memory = static_cast<uint8_t*>(_mm_malloc(m_memorySize, 16));
-        return true;
-    }
-
-    m_flags |= HugepagesAvailable;
-
 #   if defined(__APPLE__)
-    m_memory = static_cast<uint8_t*>(mmap(0, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0));
+    scratchPadMem.memory = static_cast<uint8_t*>(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, VM_FLAGS_SUPERPAGE_SIZE_2MB, 0));
 #   elif defined(__FreeBSD__)
-    m_memory = static_cast<uint8_t*>(mmap(0, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0));
+    scratchPadMem.memory = static_cast<uint8_t*>(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER | MAP_PREFAULT_READ, -1, 0));
 #   else
-    m_memory = static_cast<uint8_t*>(mmap(nullptr, m_memorySize, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0));
+    scratchPadMem.memory = static_cast<uint8_t*>(mmap(0, scratchPadMem.size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | MAP_POPULATE, 0, 0));
 #   endif
-    if (m_memory == MAP_FAILED) {
-        m_memory = static_cast<uint8_t*>(_mm_malloc(m_memorySize, 16));
-        return true;
+
+    if (scratchPadMem.memory == MAP_FAILED) {
+        return allocate(scratchPadMem, false);
     }
 
-    m_flags |= HugepagesEnabled;
+    scratchPadMem.hugePages = scratchPadMem.pages;
 
-    if (madvise(m_memory, m_memorySize, MADV_RANDOM | MADV_WILLNEED) != 0) {
+    if (madvise(scratchPadMem.memory, scratchPadMem.size, MADV_RANDOM | MADV_WILLNEED) != 0) {
         LOG_ERR("madvise failed");
     }
 
-    if (mlock(m_memory, m_memorySize) == 0) {
+    if (mlock(scratchPadMem.memory, scratchPadMem.size) == 0) {
         m_flags |= Lock;
     }
-
-    return true;
 }
 
-
-void Mem::release()
+void Mem::release(ScratchPadMem &scratchPadMem)
 {
-    if (m_flags & HugepagesEnabled) {
+    if (scratchPadMem.hugePages) {
         if (m_flags & Lock) {
-            munlock(m_memory, m_memorySize);
+            munlock(scratchPadMem.memory, scratchPadMem.size);
         }
 
-        munmap(m_memory, m_memorySize);
+        munmap(scratchPadMem.memory, scratchPadMem.size);
     }
     else {
-        _mm_free(m_memory);
+        _mm_free(scratchPadMem.memory);
     }
 }
diff --git a/src/Mem_win.cpp b/src/Mem_win.cpp
index a52cc5b5..d6ee6ba5 100644
--- a/src/Mem_win.cpp
+++ b/src/Mem_win.cpp
@@ -27,12 +27,6 @@
 #include <ntsecapi.h>
 #include <tchar.h>
 
-#ifdef __GNUC__
-#   include <mm_malloc.h>
-#else
-#   include <malloc.h>
-#endif
-
 #include "log/Log.h"
 #include "crypto/CryptoNight.h"
 #include "Mem.h"
@@ -144,63 +138,44 @@ static BOOL TrySetLockPagesPrivilege() {
 }
 
 
-bool Mem::allocate(const Options* options)
+void Mem::init(const Options* options)
 {
-    m_algo       = options->algo();
-    m_threads    = options->threads();
     m_hashFactor = options->hashFactor();
-    m_multiHashThreadMask = Mem::ThreadBitSet(options->multiHashThreadMask());
-    m_memorySize = 0;
+    m_useHugePages = options->hugePages();
+    m_algo = options->algo();
+    m_multiHashThreadMask = Mem::ThreadBitSet(static_cast<unsigned long long int>(options->multiHashThreadMask()));
 
-    size_t scratchPadSize;
-    switch (m_algo)
-    {
-        case Options::ALGO_CRYPTONIGHT_LITE:
-            scratchPadSize = MEMORY_LITE;
-            break;
-        case Options::ALGO_CRYPTONIGHT_HEAVY:
-            scratchPadSize = MEMORY_HEAVY;
-            break;
-        case Options::ALGO_CRYPTONIGHT:
-        default:
-            scratchPadSize = MEMORY;
-            break;
+    if (m_useHugePages && TrySetLockPagesPrivilege()) {
+        m_flags |= HugepagesAvailable;
     }
-
-    for (size_t i=0; i < m_threads; i++) {
-        m_memorySize += sizeof(cryptonight_ctx);
-        m_memorySize += scratchPadSize * getThreadHashFactor(i);
-    }
-
-    m_memorySize = m_memorySize - (m_memorySize % MEMORY) + MEMORY;
-
-    if (!options->hugePages()) {
-        m_memory = static_cast<uint8_t*>(_mm_malloc(m_memorySize, 16));
-        return true;
-    }
-
-    if (TrySetLockPagesPrivilege()) {
-        m_flags |= HugepagesAvailable;
-    }
-
-    m_memory = static_cast<uint8_t*>(VirtualAlloc(NULL, m_memorySize, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE));
-    if (!m_memory) {
-        m_memory = static_cast<uint8_t*>(_mm_malloc(m_memorySize, 16));
-    }
-    else {
-        m_flags |= HugepagesEnabled;
-    }
-
-    return true;
 }
 
-
-void Mem::release()
+// Points scratchPadMem.memory at a buffer of scratchPadMem.size bytes; hugePages is set to
+// scratchPadMem.pages when the large-page allocation succeeds and stays 0 otherwise.
+void Mem::allocate(ScratchPadMem& scratchPadMem, bool useHugePages)
 {
-    if (m_flags & HugepagesEnabled) {
-        VirtualFree(m_memory, 0, MEM_RELEASE);
+    scratchPadMem.hugePages = 0;
+
+    if (useHugePages) {
+        // A null result means the large-page request failed; fall through to the heap path.
+        scratchPadMem.memory = static_cast<uint8_t*>(VirtualAlloc(nullptr, scratchPadMem.size, MEM_COMMIT | MEM_RESERVE | MEM_LARGE_PAGES, PAGE_READWRITE));
+        if (scratchPadMem.memory) {
+            scratchPadMem.hugePages = scratchPadMem.pages;
+            return;
+        }
+    }
+
+    // Heap path: 4096-byte-aligned block obtained from _mm_malloc.
+    scratchPadMem.memory = static_cast<uint8_t*>(_mm_malloc(scratchPadMem.size, 4096));
+}
+
+
+void Mem::release(ScratchPadMem &scratchPadMem)
+{
+    if (scratchPadMem.hugePages) { // set by allocate() only when VirtualAlloc succeeded
+        VirtualFree(scratchPadMem.memory, 0, MEM_RELEASE);
     }
     else {
-        _mm_free(m_memory);
+        _mm_free(scratchPadMem.memory); // heap path: buffer came from _mm_malloc
     }
 }
diff --git a/src/Options.cpp b/src/Options.cpp
index 2d6682ab..11812ddd 100644
--- a/src/Options.cpp
+++ b/src/Options.cpp
@@ -279,7 +279,7 @@ constexpr static const char *pow_variant_names[] = {
         "auto",
         "0",
         "1",
-        "ipbc",
+        "tube",
         "alloy",
         "xtl",
         "msr",
@@ -984,7 +984,7 @@ bool Options::setAlgo(const char *algo)
         if (i == ARRAY_SIZE(algo_names) - 1 && (!strcmp(algo, "cryptonight-lite-ipbc") || !strcmp(algo, "cryptonight-light-ipbc") || !strcmp(algo, "cn-lite-ipbc"))) {
             showDeprecateWarning("cryptonight-light-ipbc", "cryptonight-light (with variant \"ipbc\")");
             m_algo = ALGO_CRYPTONIGHT_LITE;
-            m_powVariant = POW_IPBC;
+            m_powVariant = POW_TUBE;
             break;
         }
 
@@ -1025,8 +1025,8 @@ bool Options::parsePowVariant(const char *powVariant)
             break;
         }
 
-        if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "tube")) {
-            m_powVariant = POW_IPBC;
+        if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "ipbc") || !strcmp(powVariant, "bittube"))) {
+            m_powVariant = POW_TUBE;
             break;
         }
 
diff --git a/src/PowVariant.h b/src/PowVariant.h
index 59c8813a..fc20c02a 100644
--- a/src/PowVariant.h
+++ b/src/PowVariant.h
@@ -27,7 +27,7 @@ enum PowVariant
     POW_AUTODETECT,
     POW_V0,
     POW_V1,
-    POW_IPBC,
+    POW_TUBE,
     POW_ALLOY,
     POW_XTL,
     POW_MSR,
@@ -44,8 +44,8 @@ inline std::string getPowVariantName(PowVariant powVariant)
             return "0";
         case POW_V1:
             return "1";
-        case POW_IPBC:
-            return "ipbc";
+        case POW_TUBE:
+            return "tube";
         case POW_ALLOY:
             return "alloy";
         case POW_XTL:
@@ -104,8 +104,8 @@ inline PowVariant parseVariant(const std::string variant)
         powVariant = PowVariant::POW_V0;
     } else if (variant == "1") {
         powVariant = PowVariant::POW_V1;
-    } else if (variant == "ipbc" || variant == "tube") {
-        powVariant = PowVariant::POW_IPBC;
+    } else if (variant == "ipbc" || variant == "tube" || variant == "bittube") {
+        powVariant = PowVariant::POW_TUBE;
     } else if (variant == "xao" || variant == "alloy") {
         powVariant = PowVariant::POW_ALLOY;
     } else if (variant == "xtl" || variant == "stellite") {
diff --git a/src/Summary.cpp b/src/Summary.cpp
index 0bb6386b..cfad1e14 100644
--- a/src/Summary.cpp
+++ b/src/Summary.cpp
@@ -56,18 +56,6 @@ static void print_versions()
 }
 
 
-static void print_memory() {
-    if (Options::i()->colors()) {
-        Log::i()->text("\x1B[01;32m * \x1B[01;37mHUGE PAGES:   %s, %s",
-                       Mem::isHugepagesAvailable() ? "\x1B[01;32mavailable" : "\x1B[01;31munavailable",
-                       Mem::isHugepagesEnabled() ? "\x1B[01;32menabled" : "\x1B[01;31mdisabled");
-    }
-    else {
-        Log::i()->text(" * HUGE PAGES:   %s, %s", Mem::isHugepagesAvailable() ? "available" : "unavailable", Mem::isHugepagesEnabled() ? "enabled" : "disabled");
-    }
-}
-
-
 static void print_cpu()
 {
     if (Options::i()->colors()) {
@@ -125,14 +113,15 @@ static void print_threads()
         snprintf(affBuf, 32, ", affinity=0x%" PRIX64, Options::i()->affinity());
     }
     else {
-        affBuf[0] = '\0';
+        snprintf(affBuf, 32, ", affinity=auto");
     }
 
     Log::i()->text(Options::i()->colors() ?
-                     "\x1B[01;32m * \x1B[01;37mTHREADS:      \x1B[01;36m%d\x1B[01;37m, %s, aes=%d, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" :
-                     " * THREADS:      %d, %s, aes=%d, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s",
+                     "\x1B[01;32m * \x1B[01;37mTHREADS:      \x1B[01;36m%d\x1B[01;37m, %s, %saes=%d\x1B[01;37m, hf=%zu, %sdonate=%d%%\x1B[01;37m%s%s" :
+                     " * THREADS:      %d, %s, %saes=%d, hf=%zu, %sdonate=%d%%%s%s",
                    Options::i()->threads(),
                    Options::i()->algoName(),
+                   Options::i()->colors() && Options::i()->aesni() == 0 ? "\x1B[01;31m" : "",
                    Options::i()->aesni(),
                    Options::i()->hashFactor(),
                    Options::i()->colors() && Options::i()->donateLevel() == 0 ? "\x1B[01;31m" : "",
@@ -201,7 +190,6 @@ static void print_commands()
 void Summary::print()
 {
     print_versions();
-    print_memory();
     print_cpu();
     print_threads();
     print_pools();
diff --git a/src/api/ApiState.cpp b/src/api/ApiState.cpp
index c963a1d6..2e85a1ac 100644
--- a/src/api/ApiState.cpp
+++ b/src/api/ApiState.cpp
@@ -231,7 +231,7 @@ void ApiState::getMiner(rapidjson::Document &doc) const
     doc.AddMember("ua",           rapidjson::StringRef(Platform::userAgent()), allocator);
     doc.AddMember("cpu",          cpu, allocator);
     doc.AddMember("algo",         rapidjson::StringRef(Options::i()->algoName()), allocator);
-    doc.AddMember("hugepages",    Mem::isHugepagesEnabled(), allocator);
+    doc.AddMember("hugepages",    Mem::isHugepagesAvailable(), allocator);
     doc.AddMember("donate_level", Options::i()->donateLevel(), allocator);
 }
 
diff --git a/src/cc/CCClient.cpp b/src/cc/CCClient.cpp
index 55550576..4dc15b1b 100644
--- a/src/cc/CCClient.cpp
+++ b/src/cc/CCClient.cpp
@@ -79,7 +79,6 @@ CCClient::CCClient(Options* options, uv_async_t* async)
         m_clientStatus.setCurrentAlgoName(m_options->algoName());
     }
 
-    m_clientStatus.setHugepagesEnabled(Mem::isHugepagesEnabled());
     m_clientStatus.setHugepages(Mem::isHugepagesAvailable());
     m_clientStatus.setHashFactor(Mem::hashFactor());
 
@@ -265,10 +264,10 @@ std::shared_ptr<httplib::Response> CCClient::performRequest(const std::string& r
 
 #   ifndef XMRIG_NO_TLS
     if (m_self->m_options->ccUseTls()) {
-        cli = std::make_shared<httplib::SSLClient>(m_self->m_options->ccHost(), m_self->m_options->ccPort());
+        cli = std::make_shared<httplib::SSLClient>(m_self->m_options->ccHost(), m_self->m_options->ccPort(), 10);
     } else {
 #   endif
-        cli = std::make_shared<httplib::Client>(m_self->m_options->ccHost(), m_self->m_options->ccPort());
+        cli = std::make_shared<httplib::Client>(m_self->m_options->ccHost(), m_self->m_options->ccPort(), 10);
 #   ifndef XMRIG_NO_TLS
     }
 #   endif
diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp
index e0f90361..bf9b0b08 100644
--- a/src/crypto/CryptoNight.cpp
+++ b/src/crypto/CryptoNight.cpp
@@ -34,88 +34,94 @@
 #include "crypto/CryptoNight_test.h"
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 #   if !defined(XMRIG_ARMv7)
     if (powVersion == PowVariant::POW_V1) {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_ALLOY) {
-        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_XTL) {
-        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_MSR) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_RTO) {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     }else {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 #   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
     if (powVersion == PowVariant::POW_V1) {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_ALLOY) {
-        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_XTL) {
-        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_MSR) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
     } else if (powVersion == PowVariant::POW_RTO) {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     } else {
-        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 #   if !defined(XMRIG_ARMv7)
     if (powVersion == PowVariant::POW_V1) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
-    } else if (powVersion == PowVariant::POW_IPBC) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+    } else if (powVersion == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 #   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
     if (powVersion == PowVariant::POW_V1) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, ctx);
-    } else if (powVersion == PowVariant::POW_IPBC) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteIpbc(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad);
+    } else if (powVersion == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad);
     } else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad);
     }
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
 #   if !defined(XMRIG_ARMv7)
     if (powVersion == PowVariant::POW_XHV) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
+    }
+    else if (powVersion == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad);
     }
     else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     }
 #   endif
 }
 
 template <size_t NUM_HASH_BLOCKS>
-static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx) {
+static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) {
     if (powVersion == PowVariant::POW_XHV) {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad);
+    }
+    else if (powVersion == PowVariant::POW_TUBE) {
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyTube(input, size, output, scratchPad);
     }
     else {
-        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, ctx);
+        CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavy(input, size, output, scratchPad);
     }
 }
 
-void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx *ctx);
+void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad);
 
 template <size_t HASH_FACTOR>
 void setCryptoNightHashMethods(Options::Algo algo, bool aesni)
@@ -161,9 +167,9 @@ bool CryptoNight::init(int algo, bool aesni)
     return selfTest(algo);
 }
 
-void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx* ctx)
+void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads)
 {
-    cryptonight_hash_ctx[factor-1](powVersion, input, size, output, ctx);
+    cryptonight_hash_ctx[factor-1](powVersion, input, size, output, scratchPads); // factor is 1-based: slot factor-1 of the dispatch table is invoked
 }
 
 bool CryptoNight::selfTest(int algo)
@@ -187,8 +193,14 @@ bool CryptoNight::selfTest(int algo)
 
     uint8_t output[160];
 
-    auto ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
-    ctx->memory = (uint8_t *) _mm_malloc(MEMORY * 6, 16);
+    ScratchPad* scratchPads [MAX_NUM_HASH_BLOCKS];
+
+    for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) {
+        ScratchPad* scratchPad = static_cast<ScratchPad *>(_mm_malloc(sizeof(ScratchPad), 4096));
+        scratchPad->memory     = (uint8_t *) _mm_malloc(MEMORY * 6, 16);
+
+        scratchPads[i] = scratchPad;
+    }
 
     bool result = true;
     bool resultLite = true;
@@ -197,188 +209,206 @@ bool CryptoNight::selfTest(int algo)
     if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) {
         // cn-heavy
 
-        cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0;
         #endif
 
         // cn-heavy haven
 
-        cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, scratchPads);
         resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 96) == 0;
         #endif
+
+        // cn-heavy bittube
+
+        cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
+        resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 32) == 0;
+
+        #if MAX_NUM_HASH_BLOCKS > 1
+        cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
+        resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 64) == 0;
+        #endif
+
+        #if MAX_NUM_HASH_BLOCKS > 2
+        cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
+        resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 96) == 0;
+        #endif
     } else if (algo == Options::ALGO_CRYPTONIGHT_LITE) {
         // cn-lite v0
 
-        cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v0_lite, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v0_lite, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v0_lite, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v0_lite, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v0_lite, 160) == 0;
         #endif
 
         // cn-lite v7 tests
 
-        cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output,  test_output_v1_lite, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_v1_lite, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output,  test_output_v1_lite, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output,  test_output_v1_lite, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output,  test_output_v1_lite, 160) == 0;
         #endif
 
 
         // cn-lite ibpc tests
 
-        cryptonight_hash_ctx[0](PowVariant::POW_IPBC, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_IPBC, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_IPBC, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_IPBC, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_IPBC, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_TUBE, test_input, 76, output, scratchPads);
         resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0;
         #endif
 
     } else {
         // cn v0
 
-        cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v0, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v0, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v0, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v0, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v0, 160) == 0;
         #endif
 
         // cn v7
 
-        cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v1, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v1, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v1, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v1, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_v1, 160) == 0;
         #endif
 
         // cn xtl
 
-        cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, ctx);
+        cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_xtl, 32) == 0;
 
         #if MAX_NUM_HASH_BLOCKS > 1
-        cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_xtl, 64) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 2
-        cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_xtl, 96) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 3
-        cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_xtl, 128) == 0;
         #endif
 
         #if MAX_NUM_HASH_BLOCKS > 4
-        cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, ctx);
+        cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, scratchPads);
         result = result && memcmp(output, test_output_xtl, 160) == 0;
         #endif
     }
-    _mm_free(ctx->memory);
-    _mm_free(ctx);
+
+    for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) {
+        _mm_free(scratchPads[i]->memory);
+        _mm_free(scratchPads[i]);
+    }
 
     return result && resultLite & resultHeavy;
 }
\ No newline at end of file
diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h
index 4032802d..753c56fc 100644
--- a/src/crypto/CryptoNight.h
+++ b/src/crypto/CryptoNight.h
@@ -37,8 +37,8 @@
 #define POW_DEFAULT_INDEX_SHIFT 3
 #define POW_XLT_V4_INDEX_SHIFT 4
 
-struct cryptonight_ctx {
-    alignas(16) uint8_t state[MAX_NUM_HASH_BLOCKS][208]; // 208 instead of 200 to maintain aligned to 16 byte boundaries
+struct ScratchPad { // working set of ONE hash block; callers hand over an array of ScratchPad*, one per block
+    alignas(16) uint8_t state[208]; // hash state: 200 bytes are used, 208 keeps the size a multiple of 16
     alignas(16) uint8_t* memory;
 };
 
@@ -51,7 +51,7 @@ class CryptoNight
 public:
     static bool init(int algo, bool aesni);
 
-    static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, cryptonight_ctx* ctx);
+    static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads);
 
 private:
     static bool selfTest(int algo);
diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h
index c823d00f..377c0002 100644
--- a/src/crypto/CryptoNight_arm.h
+++ b/src/crypto/CryptoNight_arm.h
@@ -31,9 +31,12 @@
 #if defined(XMRIG_ARM) && !defined(__clang__)
 #   include "aligned_malloc.h"
 #else
+
 #   include <mm_malloc.h>
+
 #endif
 
+#include <signal.h>
 
 #include "crypto/CryptoNight.h"
 #include "crypto/soft_aes.h"
@@ -48,27 +51,32 @@ extern "C"
 #include "crypto/c_skein.h"
 }
 
-static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) {
+static inline void do_blake_hash(const uint8_t* input, size_t len, uint8_t* output)
+{   // Adapter to the common (input, len, output) shape; blake256_hash() takes the output pointer first.
     blake256_hash(output, input, len);
 }
 
 
-static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) {
+static inline void do_groestl_hash(const uint8_t* input, size_t len, uint8_t* output)
+{   // Adapter: groestl() is handed len * 8 (presumably a bit count -- confirm against c_groestl.h).
     groestl(input, len * 8, output);
 }
 
 
-static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) {
+static inline void do_jh_hash(const uint8_t* input, size_t len, uint8_t* output)
+{   // Adapter: passes 32 * 8 first (presumably the digest size in bits -- confirm) and the length as 8 * len.
     jh_hash(32 * 8, input, 8 * len, output);
 }
 
 
-static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) {
+static inline void do_skein_hash(const uint8_t* input, size_t len, uint8_t* output)
+{   // Adapter: len is ignored because xmr_skein() is called without a length argument.
     xmr_skein(input, output);
 }
 
 
-void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash};
+void (* const extra_hashes[4])(const uint8_t*, size_t, uint8_t*) = {do_blake_hash, do_groestl_hash, do_jh_hash,
+                                                                    do_skein_hash}; // picked by (state[0] & 3)
 
 
 static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64_t a, const uint64_t b)
@@ -76,6 +84,21 @@ static inline __attribute__((always_inline)) __m128i _mm_set_epi64x(const uint64
     return vcombine_u64(vcreate_u64(b), vcreate_u64(a));
 }
 
+#if __ARM_FEATURE_CRYPTO
+// x86 AESENC equivalent on the ARMv8 crypto extension: AESE with a zero key, AESMC, then XOR with rkey.
+static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
+{
+    return veorq_u8(vaesmcq_u8(vaeseq_u8(v, vdupq_n_u8(0))), rkey);
+}
+#else
+// No crypto extension: route through soft_aesenc(), the helper the SOFT_AES paths already call, so a
+// non-SOFT_AES caller gets a real round instead of an all-zero vector that silently corrupts every hash.
+static inline __attribute__((always_inline)) __m128i _mm_aesenc_si128(__m128i v, __m128i rkey)
+{
+    return soft_aesenc((uint32_t*) &v, rkey);
+}
+
+#endif
 
 /* this one was not implemented yet so here it is */
 static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i a)
@@ -87,7 +110,7 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i
 #define EXTRACT64(X) _mm_cvtsi128_si64(X)
 
 
-#if defined(XMRIG_ARMv8)
+#if defined (__arm64__) || defined (__aarch64__)
 static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
 {
     unsigned __int128 r = (unsigned __int128) a * (unsigned __int128) b;
@@ -95,7 +118,9 @@ static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi)
     return (uint64_t) r;
 }
 #else
-static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t *product_hi) {
+
+static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi)
+{
     // multiplier   = ab = a * 2^32 + b
     // multiplicand = cd = c * 2^32 + d
     // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d
@@ -119,6 +144,7 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin
 
     return product_lo;
 }
+
 #endif
 
 
@@ -141,44 +167,48 @@ template<uint8_t rcon>
 static inline void soft_aes_genkey_sub(__m128i* xout0, __m128i* xout2)
 {
     __m128i xout1 = soft_aeskeygenassist<rcon>(*xout2);
-    xout1  = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
+    xout1 = _mm_shuffle_epi32(xout1, 0xFF); // see PSHUFD, set all elems to 4th elem
     *xout0 = sl_xor(*xout0);
     *xout0 = _mm_xor_si128(*xout0, xout1);
-    xout1  = soft_aeskeygenassist<0x00>(*xout0);
-    xout1  = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
+    xout1 = soft_aeskeygenassist<0x00>(*xout0); // second pass: rcon 0, fed by the *xout0 updated just above
+    xout1 = _mm_shuffle_epi32(xout1, 0xAA); // see PSHUFD, set all elems to 3rd elem
     *xout2 = sl_xor(*xout2);
     *xout2 = _mm_xor_si128(*xout2, xout1);
 }
 
 
 template<bool SOFT_AES>
-static inline void aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5, __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
+static inline void // Derives ten round keys k0..k9 from the two 128-bit blocks at memory[0] and memory[1].
+aes_genkey(const __m128i* memory, __m128i* k0, __m128i* k1, __m128i* k2, __m128i* k3, __m128i* k4, __m128i* k5,
+           __m128i* k6, __m128i* k7, __m128i* k8, __m128i* k9)
 {
     __m128i xout0 = _mm_load_si128(memory);
     __m128i xout2 = _mm_load_si128(memory + 1);
     *k0 = xout0;
     *k1 = xout2;
 
-    SOFT_AES ? soft_aes_genkey_sub<0x01>(&xout0, &xout2) : soft_aes_genkey_sub<0x01>(&xout0, &xout2);
+    soft_aes_genkey_sub<0x01>(&xout0, &xout2); // SOFT_AES is not consulted: both old branches were identical
     *k2 = xout0;
     *k3 = xout2;
 
-    SOFT_AES ? soft_aes_genkey_sub<0x02>(&xout0, &xout2) : soft_aes_genkey_sub<0x02>(&xout0, &xout2);
+    soft_aes_genkey_sub<0x02>(&xout0, &xout2); // rcon 0x02 -> k4, k5
     *k4 = xout0;
     *k5 = xout2;
 
-    SOFT_AES ? soft_aes_genkey_sub<0x04>(&xout0, &xout2) : soft_aes_genkey_sub<0x04>(&xout0, &xout2);
+    soft_aes_genkey_sub<0x04>(&xout0, &xout2); // rcon 0x04 -> k6, k7
     *k6 = xout0;
     *k7 = xout2;
 
-    SOFT_AES ? soft_aes_genkey_sub<0x08>(&xout0, &xout2) : soft_aes_genkey_sub<0x08>(&xout0, &xout2);
+    soft_aes_genkey_sub<0x08>(&xout0, &xout2); // rcon 0x08 -> k8, k9
     *k8 = xout0;
     *k9 = xout2;
 }
 
 
 template<bool SOFT_AES>
-static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6, __m128i* x7)
+static inline void
+aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2, __m128i* x3, __m128i* x4, __m128i* x5, __m128i* x6,
+          __m128i* x7)
 {
     if (SOFT_AES) {
         *x0 = soft_aesenc((uint32_t*)x0, key);
@@ -201,11 +231,23 @@ static inline void aes_round(__m128i key, __m128i* x0, __m128i* x1, __m128i* x2,
         *x6 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x6), key));
         *x7 = vaesmcq_u8(vaeseq_u8(*((uint8x16_t *) x7), key));
     }
+#   else
+    else {
+        *x0 = _mm_aesenc_si128(*x0, key);
+        *x1 = _mm_aesenc_si128(*x1, key);
+        *x2 = _mm_aesenc_si128(*x2, key);
+        *x3 = _mm_aesenc_si128(*x3, key);
+        *x4 = _mm_aesenc_si128(*x4, key);
+        *x5 = _mm_aesenc_si128(*x5, key);
+        *x6 = _mm_aesenc_si128(*x6, key);
+        *x7 = _mm_aesenc_si128(*x7, key);
+    }
 #   endif
 }
 
 
-inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
+inline void mix_and_propagate(__m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3, __m128i& x4, __m128i& x5, __m128i& x6,
+                              __m128i& x7)
 {
     __m128i tmp0 = x0;
     x0 = _mm_xor_si128(x0, x1);
@@ -292,6 +334,7 @@ static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* ou
     xin7 = _mm_load_si128(input + 11);
 
     for (size_t i = 0; i < 16; i++) {
+
         if (!SOFT_AES) {
             aes_round<SOFT_AES>(_mm_setzero_si128(), &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         }
@@ -315,8 +358,7 @@ static inline void cn_explode_scratchpad_heavy(const __m128i* input, __m128i* ou
             xin5 ^= k9;
             xin6 ^= k9;
             xin7 ^= k9;
-        }
-        else {
+        } else {
             aes_round<SOFT_AES>(k9, &xin0, &xin1, &xin2, &xin3, &xin4, &xin5, &xin6, &xin7);
         }
 
@@ -444,8 +486,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou
     xout6 = _mm_load_si128(output + 10);
     xout7 = _mm_load_si128(output + 11);
 
-    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8)
-    {
+    for (size_t i = 0; i < MEM / sizeof(__m128i); i += 8) {
         xout0 = _mm_xor_si128(_mm_load_si128(input + i + 0), xout0);
         xout1 = _mm_xor_si128(_mm_load_si128(input + i + 1), xout1);
         xout2 = _mm_xor_si128(_mm_load_si128(input + i + 2), xout2);
@@ -478,8 +519,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou
             xout5 ^= k9;
             xout6 ^= k9;
             xout7 ^= k9;
-        }
-        else {
+        } else {
             aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         }
 
@@ -519,8 +559,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou
             xout5 ^= k9;
             xout6 ^= k9;
             xout7 ^= k9;
-        }
-        else {
+        } else {
             aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         }
 
@@ -551,8 +590,7 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou
             xout5 ^= k9;
             xout6 ^= k9;
             xout7 ^= k9;
-        }
-        else {
+        } else {
             aes_round<SOFT_AES>(k9, &xout0, &xout1, &xout2, &xout3, &xout4, &xout5, &xout6, &xout7);
         }
 
@@ -576,8 +614,8 @@ class CryptoNightMultiHash
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output,
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -588,12 +626,12 @@ public:
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size,
-                   ctx->state[hashBlock], 200);
+                   scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -608,12 +646,11 @@ public:
                 __m128i cx;
 
                 if (SOFT_AES) {
-                    cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
+                    cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK],
+                                     _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 } else {
-                    cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]);
-#           ifndef XMRIG_ARMv7
-                    cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
-#           endif
+                    cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 }
 
                 _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK],
@@ -642,15 +679,15 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
-                                                       output + hashBlock * 32);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
         }
     }
 
     inline static void hashPowV2(const uint8_t* __restrict__ input,
-                              size_t size,
-                              uint8_t *__restrict__ output,
-                              cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -661,14 +698,15 @@ public:
         uint64_t tweak1_2[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state,
+                   200);
             tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(input + 35 + hashBlock * size) ^
-                                   *(reinterpret_cast<const uint64_t*>(ctx->state[hashBlock]) + 24));
+                                   *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -684,12 +722,11 @@ public:
                 __m128i cx;
 
                 if (SOFT_AES) {
-                    cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
+                    cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK],
+                                     _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 } else {
-                    cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]);
-#           ifndef XMRIG_ARMv7
-                    cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
-#           endif
+                    cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 }
 
                 _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK],
@@ -698,7 +735,7 @@ public:
                 const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[hashBlock][idx[hashBlock] & MASK])[11];
                 static const uint32_t table = 0x75310;
                 const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-                ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+                ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
                 idx[hashBlock] = EXTRACT64(cx);
                 bx[hashBlock] = cx;
@@ -727,15 +764,15 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
-                                                       output + hashBlock * 32);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
         }
     }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
+                                    size_t size,
+                                    uint8_t* __restrict__ output,
+                                    ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -746,14 +783,15 @@ public:
         uint64_t tweak1_2[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state,
+                   200);
             tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(input + 35 + hashBlock * size) ^
-                                   *(reinterpret_cast<const uint64_t*>(ctx->state[hashBlock]) + 24));
+                                   *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -769,12 +807,11 @@ public:
                 __m128i cx;
 
                 if (SOFT_AES) {
-                    cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
+                    cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK],
+                                     _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 } else {
-                    cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]);
-#           ifndef XMRIG_ARMv7
-                    cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
-#           endif
+                    cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 }
 
                 _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK],
@@ -783,7 +820,7 @@ public:
                 const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[hashBlock][idx[hashBlock] & MASK])[11];
                 static const uint32_t table = 0x75310;
                 const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-                ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+                ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
                 idx[hashBlock] = EXTRACT64(cx);
                 bx[hashBlock] = cx;
@@ -803,7 +840,8 @@ public:
 
                 ah[hashBlock] ^= tweak1_2[hashBlock];
 
-                ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0];
+                ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*) &l[hashBlock][idx[hashBlock] &
+                                                                                                    MASK])[0];
 
                 ah[hashBlock] ^= ch;
                 al[hashBlock] ^= cl;
@@ -814,15 +852,15 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
-                                                       output + hashBlock * 32);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
         }
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -833,12 +871,12 @@ public:
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size,
-                   ctx->state[hashBlock], 200);
+                   scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -853,12 +891,11 @@ public:
                 __m128i cx;
 
                 if (SOFT_AES) {
-                    cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
+                    cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK],
+                                     _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 } else {
-                    cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]);
-#           ifndef XMRIG_ARMv7
-                    cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
-#           endif
+                    cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 }
 
                 _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK],
@@ -882,11 +919,13 @@ public:
                 al[hashBlock] ^= cl;
                 idx[hashBlock] = al[hashBlock];
 
-                int64_t n  = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0];
-                int32_t d  = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2];
-                int64_t q = n / (d | 0x5);
+                const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[hashBlock][idx[hashBlock] & MASK]));
+                const int64_t n = vgetq_lane_s64(x, 0);
+                const int32_t d = vgetq_lane_s32(x, 2);
+                const int64_t q = n / (d | 0x5);
+
+                ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
 
-                ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
                 idx[hashBlock] = d ^ q;
             }
         }
@@ -894,15 +933,15 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
-                                                       output + hashBlock * 32);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
         }
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -913,12 +952,12 @@ public:
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size,
-                   ctx->state[hashBlock], 200);
+                   scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -933,12 +972,11 @@ public:
                 __m128i cx;
 
                 if (SOFT_AES) {
-                    cx = soft_aesenc((uint32_t*)&l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
+                    cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK],
+                                     _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 } else {
-                    cx = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]);
-#           ifndef XMRIG_ARMv7
-                    cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
-#           endif
+                    cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                    cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock]));
                 }
 
                 _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK],
@@ -962,11 +1000,13 @@ public:
                 al[hashBlock] ^= cl;
                 idx[hashBlock] = al[hashBlock];
 
-                int64_t n  = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0];
-                int32_t d  = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2];
-                int64_t q = n / (d | 0x5);
+                const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[hashBlock][idx[hashBlock] & MASK]));
+                const int64_t n = vgetq_lane_s64(x, 0);
+                const int32_t d = vgetq_lane_s32(x, 2);
+                const int64_t q = n / (d | 0x5);
+
+                ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
 
-                ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
                 idx[hashBlock] = (~d) ^ q;
             }
         }
@@ -974,8 +1014,129 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
-                                                       output + hashBlock * 32);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
+        }
+    }
+
+    // CryptoNight-Heavy "Tube" variant (the commit notes call it TubeV4: cn-heavy + ipbc mod + soft-aes mod).
+    // `input` holds NUM_HASH_BLOCKS messages of `size` bytes back to back; 32 bytes per block go to `output`.
+    // Eight bytes at offset 35 of every message are read for tweak1_2, so `size` must be at least 43.
+    // The main loop always goes through saes_table; SOFT_AES only selects the explode/implode flavour.
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,
+                                     ScratchPad** __restrict__ scratchPad)
+    {
+        const uint8_t* l[NUM_HASH_BLOCKS];
+        uint64_t* h[NUM_HASH_BLOCKS];
+        uint64_t al[NUM_HASH_BLOCKS];
+        uint64_t ah[NUM_HASH_BLOCKS];
+        __m128i bx[NUM_HASH_BLOCKS];
+        uint64_t idx[NUM_HASH_BLOCKS];
+        uint64_t tweak1_2[NUM_HASH_BLOCKS];
+
+        // Hash every message into its own state, then mix message bytes 35..42 with state word 24.
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
+            tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 +
+                                                                      hashBlock * size) ^
+                                   *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
+        }
+
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
+
+            cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
+
+            al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
+            ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5];
+            bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]);
+            idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
+        }
+
+        // Word views for the table round: k starts as the key (ah:al) and ends as the round result, x is the block.
+        alignas(16) uint32_t k[4];
+        alignas(16) uint32_t x[4];
+
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+                __m128i cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                const __m128i key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
+
+                _mm_store_si128((__m128i*) k, key);
+                // cmpeq(0, 0) is all ones, so this XOR inverts every bit of the loaded block.
+                cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+                _mm_store_si128((__m128i*) x, cx);
+
+                // Chained round: each k word takes four table lookups and is folded into x before the next word.
+                k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                        saes_table[3][BYTE(x[3], 3)];
+                x[0] ^= k[0];
+                k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                        saes_table[3][BYTE(x[0], 3)];
+                x[1] ^= k[1];
+                k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                        saes_table[3][BYTE(x[1], 3)];
+                x[2] ^= k[2];
+                k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                        saes_table[3][BYTE(x[2], 3)];
+
+                cx = _mm_load_si128((__m128i*) k);
+
+                _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx));
+
+                // Flip bits 4/5 of byte 11 of the value just stored, as selected from the 0x75310 table.
+                const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[hashBlock][idx[hashBlock] & MASK])[11];
+                static const uint32_t table = 0x75310;
+                const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+                ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
+                idx[hashBlock] = EXTRACT64(cx);
+                bx[hashBlock] = cx;
+
+                uint64_t* const m = (uint64_t*) &l[hashBlock][idx[hashBlock] & MASK];
+                const uint64_t cl = m[0];
+                const uint64_t ch = m[1];
+                uint64_t hi;
+                const uint64_t lo = __umul128(idx[hashBlock], cl, &hi);
+
+                al[hashBlock] += hi;
+                ah[hashBlock] += lo;
+
+                // The high word is written with tweak1_2 applied; the second XOR restores ah for the next steps.
+                ah[hashBlock] ^= tweak1_2[hashBlock];
+                m[0] = al[hashBlock];
+                m[1] = ah[hashBlock];
+                ah[hashBlock] ^= tweak1_2[hashBlock];
+                m[1] ^= m[0];
+
+                ah[hashBlock] ^= ch;
+                al[hashBlock] ^= cl;
+                idx[hashBlock] = al[hashBlock];
+
+                // Division step: n is the low 64 bits, d the third 32-bit lane; (d | 5) is never zero.
+                const int64x2_t lanes = vld1q_s64(reinterpret_cast<const int64_t*>(&l[hashBlock][idx[hashBlock] & MASK]));
+                const int64_t n = vgetq_lane_s64(lanes, 0);
+                const int32_t d = vgetq_lane_s32(vreinterpretq_s32_s64(lanes), 2);
+                const int64_t q = n / (d | 0x5);
+
+                ((int64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
+
+                idx[hashBlock] = d ^ q;
+            }
+        }
+
+#undef BYTE
+
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
+            keccakf(h[hashBlock], 24);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
+        }
     }
 };
@@ -986,8 +1147,8 @@ class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 1>
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output,
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -996,10 +1157,10 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1012,13 +1173,10 @@ public:
             __m128i cx;
 
             if (SOFT_AES) {
-                cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
-            }
-            else {
-                cx = _mm_load_si128((__m128i *) &l[idx & MASK]);
-    #           ifndef XMRIG_ARMv7
-                cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al);
-    #           endif
+                cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al));
+            } else {
+                cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+                cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
             }
 
             _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
@@ -1043,83 +1201,13 @@ public:
 
         cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-  }
-
-  inline static void hashPowV2(const uint8_t* __restrict__ input,
-                          size_t size,
-                          uint8_t *__restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
-  {
-    const uint8_t* l;
-    uint64_t* h;
-    uint64_t al;
-    uint64_t ah;
-    __m128i bx;
-    uint64_t idx;
-
-    keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
-
-    uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                        *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-    l = ctx->memory;
-    h = reinterpret_cast<uint64_t*>(ctx->state[0]);
-
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
-
-    al = h[0] ^ h[4];
-    ah = h[1] ^ h[5];
-    bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
-    idx = h[0] ^ h[4];
-
-    for (size_t i = 0; i < ITERATIONS; i++) {
-      __m128i cx;
-
-      if (SOFT_AES) {
-        cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
-      }
-      else {
-        cx = _mm_load_si128((__m128i *) &l[idx & MASK]);
-        #           ifndef XMRIG_ARMv7
-        cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al);
-        #           endif
-      }
-
-      _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
-      const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[idx & MASK])[11];
-      static const uint32_t table = 0x75310;
-      const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-      ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-      idx = EXTRACT64(cx);
-      bx = cx;
-
-      uint64_t hi, lo, cl, ch;
-      cl = ((uint64_t*) &l[idx & MASK])[0];
-      ch = ((uint64_t*) &l[idx & MASK])[1];
-      lo = __umul128(idx, cl, &hi);
-
-      al += hi;
-      ah += lo;
-
-      ah ^= tweak1_2;
-      ((uint64_t*) &l[idx & MASK])[0] = al;
-      ((uint64_t*) &l[idx & MASK])[1] = ah;
-      ah ^= tweak1_2;
-
-      ah ^= ch;
-      al ^= cl;
-      idx = al;
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
-    keccakf(h, 24);
-    extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-  }
-
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1128,12 +1216,12 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
         uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1146,20 +1234,17 @@ public:
             __m128i cx;
 
             if (SOFT_AES) {
-                cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
-            }
-            else {
-                cx = _mm_load_si128((__m128i *) &l[idx & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al);
-#           endif
+                cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al));
+            } else {
+                cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+                cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
             }
 
             _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
             const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[idx & MASK])[11];
             static const uint32_t table = 0x75310;
             const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             idx = EXTRACT64(cx);
             bx = cx;
 
@@ -1176,8 +1261,6 @@ public:
             ((uint64_t*) &l[idx & MASK])[1] = ah;
             ah ^= tweak1_2;
 
-            ((uint64_t*)&l[idx & MASK])[1] ^= ((uint64_t*)&l[idx & MASK])[0];
-
             ah ^= ch;
             al ^= cl;
             idx = al;
@@ -1185,13 +1268,82 @@ public:
 
         cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+    }
+    // Hashes `input` (`size` bytes) using scratchPad[0] only; the result is written to `output` by the selected extra_hashes entry.
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
+                                    size_t size,
+                                    uint8_t* __restrict__ output,
+                                    ScratchPad** __restrict__ scratchPad)  // only scratchPad[0] is dereferenced here
+    {
+        const uint8_t* l;  // scratchPad[0]->memory; declared const but written through casts below
+        uint64_t* h;       // 64-bit view of scratchPad[0]->state
+        uint64_t al;       // low half of the running key (paired with ah)
+        uint64_t ah;       // high half of the running key
+        __m128i bx;        // holds the previous iteration's cx
+        uint64_t idx;      // current scratchpad offset, always used as idx & MASK
+        // Fill scratchPad[0]->state from the input (200 is passed as the requested output length).
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
+        // tweak = 8 bytes at input+35 XOR 64-bit word 24 of the state. NOTE(review): size is not checked here; assumes size >= 43 - TODO confirm.
+        uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        // Presumably expands the state into the scratchpad (helper defined elsewhere) - TODO confirm.
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
+        // Seed the loop registers from state words 0..7.
+        al = h[0] ^ h[4];
+        ah = h[1] ^ h[5];
+        bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
+        idx = h[0] ^ h[4];  // same value as al
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx;
+            // AES step on the 16 bytes at l[idx & MASK], keyed with (ah:al); SOFT_AES selects the software helper.
+            if (SOFT_AES) {
+                cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al));
+            } else {
+                cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+                cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
+            }
+
+            _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));  // write bx ^ cx back to the same slot
+            const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[idx & MASK])[11];  // byte 11 of the block just stored
+            static const uint32_t table = 0x75310;
+            const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;  // even shift amount derived from bits of tmp
+            ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);  // flips at most bits 4 and 5 of byte 11
+            idx = EXTRACT64(cx);  // presumably the low 64 bits of cx (macro defined elsewhere)
+            bx = cx;
+            // Second half: multiply step on the block addressed by the new idx.
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l[idx & MASK])[0];
+            ch = ((uint64_t*) &l[idx & MASK])[1];
+            lo = __umul128(idx, cl, &hi);  // presumably lo/hi halves of the 128-bit product idx * cl
+
+            al += hi;
+            ah += lo;
+            // The tweak is applied only to the value stored in the scratchpad; the second XOR restores ah.
+            ah ^= tweak1_2;
+            ((uint64_t*) &l[idx & MASK])[0] = al;
+            ((uint64_t*) &l[idx & MASK])[1] = ah;
+            ah ^= tweak1_2;
+            // XOR the low word just stored into the high word just stored (in the scratchpad only).
+            ((uint64_t*) &l[idx & MASK])[1] ^= ((uint64_t*) &l[idx & MASK])[0];
+
+            ah ^= ch;
+            al ^= cl;
+            idx = al;
+        }
+        // Presumably folds the scratchpad back into the state (helper defined elsewhere) - TODO confirm.
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
+        keccakf(h, 24);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // low 2 bits of state byte 0 pick the final function
+    }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1200,12 +1352,12 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) scratchPad[0]->state, (__m128i*) scratchPad[0]->memory);
 
-        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         al = h[0] ^ h[4];
         ah = h[1] ^ h[5];
@@ -1216,13 +1368,10 @@ public:
             __m128i cx;
 
             if (SOFT_AES) {
-                cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
-            }
-            else {
-                cx = _mm_load_si128((__m128i *) &l[idx & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al);
-#           endif
+                cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al));
+            } else {
+                cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+                cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
             }
 
             _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
@@ -1244,23 +1393,25 @@ public:
             al ^= cl;
             idx = al;
 
-            int64_t n  = ((int64_t*)&l[idx & MASK])[0];
-            int32_t d  = ((int32_t*)&l[idx & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[idx & MASK]));
+            const int64_t n = vgetq_lane_s64(x, 0);
+            const int32_t d = vgetq_lane_s32(x, 2);
+            const int64_t q = n / (d | 0x5);
+
+            ((int64_t*) &l[idx & MASK])[0] = n ^ q;
 
-            ((int64_t*)&l[idx & MASK])[0] = n ^ q;
             idx = d ^ q;
         }
 
-        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) scratchPad[0]->memory, (__m128i*) scratchPad[0]->state);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1269,10 +1420,10 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1285,13 +1436,10 @@ public:
             __m128i cx;
 
             if (SOFT_AES) {
-                cx = soft_aesenc((uint32_t*)&l[idx & MASK], _mm_set_epi64x(ah, al));
-            }
-            else {
-                cx = _mm_load_si128((__m128i *) &l[idx & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah, al);
-#           endif
+                cx = soft_aesenc((uint32_t*) &l[idx & MASK], _mm_set_epi64x(ah, al));
+            } else {
+                cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+                cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah, al));
             }
 
             _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
@@ -1313,17 +1461,122 @@ public:
             al ^= cl;
             idx = al;
 
-            int64_t n  = ((int64_t*)&l[idx & MASK])[0];
-            int32_t d  = ((int32_t*)&l[idx & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[idx & MASK]));
+            const int64_t n = vgetq_lane_s64(x, 0);
+            const int32_t d = vgetq_lane_s32(x, 2);
+            const int64_t q = n / (d | 0x5);
+
+            ((int64_t*) &l[idx & MASK])[0] = n ^ q;
 
-            ((int64_t*)&l[idx & MASK])[0] = n ^ q;
             idx = (~d) ^ q;
         }
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+    }
+
+    // Hashes `input` (`size` bytes) using scratchPad[0] only; the result is written to `output` by the selected extra_hashes entry.
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,
+                                     ScratchPad** __restrict__ scratchPad)  // only scratchPad[0] is dereferenced here
+    {
+        const uint8_t* l;  // scratchPad[0]->memory; declared const but written through casts below
+        uint64_t* h;       // 64-bit view of scratchPad[0]->state
+        uint64_t al;       // low half of the running key (paired with ah)
+        uint64_t ah;       // high half of the running key
+        __m128i bx;        // holds the previous iteration's cx
+        uint64_t idx;      // current scratchpad offset, always used as idx & MASK
+        // Fill scratchPad[0]->state from the input (200 is passed as the requested output length).
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
+        // tweak = 8 bytes at input+35 XOR 64-bit word 24 of the state. NOTE(review): size is not checked here; assumes size >= 43 - TODO confirm.
+        uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        // Presumably expands the state into the scratchpad (helper defined elsewhere) - TODO confirm.
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
+        // Seed the loop registers from state words 0..7.
+        al = h[0] ^ h[4];
+        ah = h[1] ^ h[5];
+        bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
+        idx = h[0] ^ h[4];  // same value as al
+        // 16-byte aligned scratch words for the table-driven round below; k starts as the key and ends as the round output.
+        union alignas(16)
+        {
+            uint32_t k[4];
+            uint64_t v64[2];  // NOTE(review): not referenced anywhere in this function
+        };
+        alignas(16) uint32_t x[4];  // working copy of the inverted scratchpad block
+        // BYTE(p, i): i-th byte of object p in memory order; #undef'd right after the loop.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+            // NOTE(review): this loop always uses the saes_table path below; SOFT_AES is only forwarded to explode/implode.
+            const __m128i& key = _mm_set_epi64x(ah, al);
+
+            _mm_store_si128((__m128i*) k, key);
+            cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));  // XOR with all-ones, i.e. bitwise NOT of the block
+            _mm_store_si128((__m128i*) x, cx);
+            // Each k[j] is XORed back into x[j] before the next word is computed, so later words depend on earlier ones.
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+            // x[3] is not updated: nothing reads it again before x is overwritten next iteration.
+            cx = _mm_load_si128((__m128i*) k);
+
+            _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));  // write bx ^ cx back to the same slot
+            const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[idx & MASK])[11];  // byte 11 of the block just stored
+            static const uint32_t table = 0x75310;
+            const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;  // even shift amount derived from bits of tmp
+            ((uint8_t*) (&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);  // flips at most bits 4 and 5 of byte 11
+
+            idx = EXTRACT64(cx);  // presumably the low 64 bits of cx (macro defined elsewhere)
+            bx = cx;
+            // Second half: multiply step on the block addressed by the new idx.
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l[idx & MASK])[0];
+            ch = ((uint64_t*) &l[idx & MASK])[1];
+            lo = __umul128(idx, cl, &hi);  // presumably lo/hi halves of the 128-bit product idx * cl
+
+            al += hi;
+            ah += lo;
+            // The tweak is applied only to the value stored in the scratchpad; the second XOR restores ah.
+            ah ^= tweak1_2;
+            ((uint64_t*) &l[idx & MASK])[0] = al;
+            ((uint64_t*) &l[idx & MASK])[1] = ah;
+            ah ^= tweak1_2;
+            // XOR the low word just stored into the high word just stored (in the scratchpad only).
+            ((uint64_t*) &l[idx & MASK])[1] ^= ((uint64_t*) &l[idx & MASK])[0];
+
+            ah ^= ch;
+            al ^= cl;
+            idx = al;
+            // Division step. NOTE(review): this `x` shadows the uint32_t x[4] array declared before the loop.
+            const int64x2_t x = vld1q_s64(reinterpret_cast<const int64_t*>(&l[idx & MASK]));
+            const int64_t n = vgetq_lane_s64(x, 0);  // signed 64-bit value from bytes 0..7 of the block
+            const int32_t d = vgetq_lane_s32(x, 2);  // 32-bit lane 2; NOTE(review): x is int64x2_t, presumably relies on lax vector conversions
+            const int64_t q = n / (d | 0x5);  // (d | 0x5) has bits 0 and 2 set, so the divisor is never zero
+            // Only the first 8 bytes of the block are rewritten.
+            ((int64_t*) &l[idx & MASK])[0] = n ^ q;
+
+            idx = d ^ q;
+        }
+#undef BYTE
+        // Presumably folds the scratchpad back into the state (helper defined elsewhere) - TODO confirm.
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
+        keccakf(h, 24);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // low 2 bits of state byte 0 pick the final function
+    }
 };
 
@@ -1333,16 +1586,16 @@ class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 2>
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output,
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1363,16 +1616,14 @@ public:
             __m128i cx1;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
             } else {
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
 
-# 			ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -1420,137 +1671,27 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
-  inline static void hashPowV2(const uint8_t* __restrict__ input,
-                          size_t size,
-                          uint8_t *__restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
-  {
-    keccak(input, (int) size, ctx->state[0], 200);
-    keccak(input + size, (int) size, ctx->state[1], 200);
-
-        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEM;
-    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-    uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
-
-    uint64_t al0 = h0[0] ^h0[4];
-    uint64_t al1 = h1[0] ^h1[4];
-    uint64_t ah0 = h0[1] ^h0[5];
-    uint64_t ah1 = h1[1] ^h1[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-
-    uint64_t idx0 = h0[0] ^h0[4];
-    uint64_t idx1 = h1[0] ^h1[4];
-
-    for (size_t i = 0; i < ITERATIONS; i++) {
-      __m128i cx0;
-      __m128i cx1;
-
-      if (SOFT_AES) {
-        cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-      } else {
-        cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-        cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-
-# 			ifndef XMRIG_ARMv7
-        cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-        cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-#           endif
-      }
-
-      _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-      _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-
-            static const uint32_t table = 0x75310;
-            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
-            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
-            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-
-      idx0 = EXTRACT64(cx0);
-      idx1 = EXTRACT64(cx1);
-
-      bx0 = cx0;
-      bx1 = cx1;
-
-      uint64_t hi, lo, cl, ch;
-      cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-      ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-      lo = __umul128(idx0, cl, &hi);
-
-      al0 += hi;
-      ah0 += lo;
-
-            ah0 ^= tweak1_2_0;
-            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
-            ah0 ^= tweak1_2_0;
-
-      ah0 ^= ch;
-      al0 ^= cl;
-      idx0 = al0;
-
-      cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-      ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-      lo = __umul128(idx1, cl, &hi);
-
-      al1 += hi;
-      ah1 += lo;
-
-            ah1 ^= tweak1_2_1;
-            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
-            ah1 ^= tweak1_2_1;
-
-      ah1 ^= ch;
-      al1 ^= cl;
-      idx1 = al1;
-    }
-
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-
-    extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-    extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-  }
-
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1571,16 +1712,14 @@ public:
             __m128i cx1;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
             } else {
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
 
-# 			ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -1589,10 +1728,10 @@ public:
             static const uint32_t table = 0x75310;
             uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
             uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
             idx0 = EXTRACT64(cx0);
             idx1 = EXTRACT64(cx1);
@@ -1613,7 +1752,115 @@ public:
             ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
             ah0 ^= tweak1_2_0;
 
-            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+        }
+
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+    }
+
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
+                                    size_t size,
+                                    uint8_t* __restrict__ output,
+                                    ScratchPad** __restrict__ scratchPad)
+    {
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+            }
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+
+            bx0 = cx0;
+            bx1 = cx1;
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);
+
+            al0 += hi;
+            ah0 += lo;
+
+            ah0 ^= tweak1_2_0;
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;
+
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0];
 
             ah0 ^= ch;
             al0 ^= cl;
@@ -1631,7 +1878,7 @@ public:
             ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
             ah1 ^= tweak1_2_1;
 
-            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
 
             ah1 ^= ch;
             al1 ^= cl;
@@ -1644,22 +1891,22 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1680,16 +1927,14 @@ public:
             __m128i cx1;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
             } else {
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
 
-# 			ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -1716,12 +1961,15 @@ public:
             al0 ^= cl;
             idx0 = al0;
 
-            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);
+            const int64_t q0 = n0 / (d0 | 0x5);
+
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
+
+            idx0 = d0 ^ q0;
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
 
             cl = ((uint64_t*) &l1[idx1 & MASK])[0];
             ch = ((uint64_t*) &l1[idx1 & MASK])[1];
@@ -1737,12 +1985,14 @@ public:
             al1 ^= cl;
             idx1 = al1;
 
-            n  = ((int64_t*)&l1[idx1 & MASK])[0];
-            d  = ((int32_t*)&l1[idx1 & MASK])[2];
-            q = n / (d | 0x5);
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
+
+            idx1 = d1 ^ q1;
         }
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -1751,22 +2001,22 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1787,16 +2037,14 @@ public:
             __m128i cx1;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
             } else {
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
 
-# 			ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -1823,12 +2071,14 @@ public:
             al0 ^= cl;
             idx0 = al0;
 
-            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);
+            const int64_t q0 = n0 / (d0 | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
+
+            idx0 = (~d0) ^ q0;
 
             cl = ((uint64_t*) &l1[idx1 & MASK])[0];
             ch = ((uint64_t*) &l1[idx1 & MASK])[1];
@@ -1844,12 +2094,14 @@ public:
             al1 ^= cl;
             idx1 = al1;
 
-            n  = ((int64_t*)&l1[idx1 & MASK])[0];
-            d  = ((int32_t*)&l1[idx1 & MASK])[2];
-            q = n / (d | 0x5);
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
+
+            idx1 = (~d1) ^ q1;
         }
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -1858,8 +2110,178 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,  ///< two messages of `size` bytes each, stored back to back
+                                     size_t size,  ///< length in bytes of ONE message (the second starts at input + size)
+                                     uint8_t* __restrict__ output,  ///< results are written at output and at output + 32
+                                     ScratchPad** __restrict__ scratchPad)  ///< scratchPad[0] / scratchPad[1]: state + memory of lane 0 / lane 1
+    {  // Computes two hashes ("lanes" 0 and 1) interleaved in one loop; every step below exists once per lane.
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);  // lane 0: message -> state
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);  // lane 1: message -> state
+
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^  // 8 bytes at message offset 35 ...
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));  // ... XOR 64-bit word 24 of the state
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^  // NOTE(review): offset 35 is odd, so this is an unaligned 64-bit read - confirm the target tolerates it
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+
+        const uint8_t* l0 = scratchPad[0]->memory;  // lane scratchpads; written through casts below despite the const
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);  // lane states viewed as 64-bit words
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);  // presumably expands the state into the scratchpad (helper defined elsewhere)
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+
+        uint64_t al0 = h0[0] ^h0[4];  // (ah, al): running 128-bit value, used as the round key below
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);  // bx: previous round output, XORed into the block written back
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];  // scratchpad cursor; always masked with MASK before use
+        uint64_t idx1 = h1[0] ^h1[4];
+
+        union alignas(16)  // anonymous union: k is the 16-byte scratch for key / round output
+        {
+            uint32_t k[4];
+            uint64_t v64[2];  // not referenced anywhere in this function
+        };
+        alignas(16) uint32_t x[4];  // 16-byte scratch for the (inverted) scratchpad block
+
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);  // fetch the current 16-byte block of each lane
+            __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+
+            const __m128i& key0 = _mm_set_epi64x(ah0, al0);
+
+            _mm_store_si128((__m128i*) k, key0);  // k starts as the round key
+            cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));  // cmpeq(0, 0) is all-ones, so this inverts every bit of the block
+            _mm_store_si128((__m128i*) x, cx0);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];  // feedback: the fresh k word changes x before the next column is looked up
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+
+            cx0 = _mm_load_si128((__m128i*) k);  // lane 0 round output; table path is taken regardless of SOFT_AES
+
+            const __m128i& key1 = _mm_set_epi64x(ah1, al1);
+
+            _mm_store_si128((__m128i*) k, key1);  // same round for lane 1, reusing the k / x scratch
+            cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*) x, cx1);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+
+            cx1 = _mm_load_si128((__m128i*) k);  // lane 1 round output
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));  // write bx ^ cx back over the block just read
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+
+            static const uint32_t table = 0x75310;  // packed lookup; (table >> index) & 0x30 picks the bits flipped in byte 11
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];  // byte 11 of the block just stored
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
+            idx0 = EXTRACT64(cx0);  // next cursor comes from the round output (EXTRACT64 is defined elsewhere)
+            idx1 = EXTRACT64(cx1);
+
+            bx0 = cx0;  // remember this round's output for the next iteration's write-back
+            bx1 = cx1;
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];  // lane 0: the two 64-bit words at the new cursor
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);  // idx0 * cl, split into lo / hi 64-bit halves
+
+            al0 += hi;
+            ah0 += lo;
+
+            ah0 ^= tweak1_2_0;  // the tweak is applied only to the stored copy ...
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;  // ... and removed again from the running value
+
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0];  // fold stored word 0 into stored word 1
+
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+
+            // Lane 0 division step: n0 = 64-bit word 0, d0 = 32-bit lane 2 (bytes 8..11) of the block at the new cursor.
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);  // x0 is an int64x2_t handed to an s32 lane accessor
+            const int64_t q0 = n0 / (d0 | 0x5);  // "| 0x5" keeps the divisor non-zero. NOTE(review): it can still be -1 - confirm n0 == INT64_MIN is harmless on the target
+
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
+
+            idx0 = d0 ^ q0;  // cursor for the next iteration
+
+            // Lane 1: same multiply / store / division sequence as lane 0 above.
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
+
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
+
+            idx1 = d1 ^ q1;
+        }
+#undef BYTE
+
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);  // presumably folds the scratchpad back into the state (helper defined elsewhere)
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+
+        keccakf(h0, 24);  // NOTE(review): 24 is presumably the round count - confirm against keccakf
+        keccakf(h1, 24);
+
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // low two bits of state byte 0 select one of four final hash functions
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 };
 
@@ -1869,19 +2291,19 @@ class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 3>
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output,
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1902,25 +2324,23 @@ public:
         uint64_t idx1 = h1[0] ^h1[4];
         uint64_t idx2 = h2[0] ^h2[4];
 
-    for (size_t i = 0; i < ITERATIONS; i++) {
-        __m128i cx0;
-        __m128i cx1;
-        __m128i cx2;
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
 
-        if (SOFT_AES) {
-            cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-            cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-            cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-        }
-        else {
-            cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-            cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
-            cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]);
-#           ifndef XMRIG_ARMv7
-            cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-            cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-            cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-#           endif
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -1990,185 +2410,33 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 
-  inline static void hashPowV2(const uint8_t* __restrict__ input,
-                          size_t size,
-                          uint8_t *__restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
-  {
-    keccak(input, (int) size, ctx->state[0], 200);
-    keccak(input + size, (int) size, ctx->state[1], 200);
-    keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-
-      uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-      uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
-      uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEM;
-    const uint8_t* l2 = ctx->memory + 2 * MEM;
-    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-    uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-    uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
-
-    uint64_t al0 = h0[0] ^h0[4];
-    uint64_t al1 = h1[0] ^h1[4];
-    uint64_t al2 = h2[0] ^h2[4];
-    uint64_t ah0 = h0[1] ^h0[5];
-    uint64_t ah1 = h1[1] ^h1[5];
-    uint64_t ah2 = h2[1] ^h2[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-    __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-
-    uint64_t idx0 = h0[0] ^h0[4];
-    uint64_t idx1 = h1[0] ^h1[4];
-    uint64_t idx2 = h2[0] ^h2[4];
-
-    for (size_t i = 0; i < ITERATIONS; i++) {
-      __m128i cx0;
-      __m128i cx1;
-      __m128i cx2;
-
-      if (SOFT_AES) {
-        cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-        cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-      }
-      else {
-        cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-        cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
-        cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]);
-#           ifndef XMRIG_ARMv7
-        cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-        cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-        cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-#           endif
-      }
-
-      _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-      _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-      _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
-
-          static const uint32_t table = 0x75310;
-          uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
-          uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-
-      idx0 = EXTRACT64(cx0);
-      idx1 = EXTRACT64(cx1);
-      idx2 = EXTRACT64(cx2);
-
-      bx0 = cx0;
-      bx1 = cx1;
-      bx2 = cx2;
-
-
-      uint64_t hi, lo, cl, ch;
-      cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-      ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-      lo = __umul128(idx0, cl, &hi);
-
-      al0 += hi;
-      ah0 += lo;
-
-          ah0 ^= tweak1_2_0;
-          ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-          ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
-          ah0 ^= tweak1_2_0;
-
-      ah0 ^= ch;
-      al0 ^= cl;
-      idx0 = al0;
-
-
-      cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-      ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-      lo = __umul128(idx1, cl, &hi);
-
-      al1 += hi;
-      ah1 += lo;
-
-          ah1 ^= tweak1_2_1;
-          ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-          ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
-          ah1 ^= tweak1_2_1;
-
-      ah1 ^= ch;
-      al1 ^= cl;
-      idx1 = al1;
-
-
-      cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-      ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-      lo = __umul128(idx2, cl, &hi);
-
-      al2 += hi;
-      ah2 += lo;
-
-          ah2 ^= tweak1_2_2;
-          ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-          ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
-          ah2 ^= tweak1_2_2;
-
-      ah2 ^= ch;
-      al2 ^= cl;
-      idx2 = al2;
-    }
-
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-    keccakf(h2, 24);
-
-    extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-    extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-    extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-  }
-
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2195,19 +2463,17 @@ public:
             __m128i cx2;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-            }
-            else {
-                cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-#           endif
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -2217,13 +2483,13 @@ public:
             static const uint32_t table = 0x75310;
             uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
             uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
             idx0 = EXTRACT64(cx0);
             idx1 = EXTRACT64(cx1);
@@ -2247,8 +2513,6 @@ public:
             ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
             ah0 ^= tweak1_2_0;
 
-            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
-
             ah0 ^= ch;
             al0 ^= cl;
             idx0 = al0;
@@ -2266,8 +2530,6 @@ public:
             ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
             ah1 ^= tweak1_2_1;
 
-            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
-
             ah1 ^= ch;
             al1 ^= cl;
             idx1 = al1;
@@ -2285,8 +2547,6 @@ public:
             ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
             ah2 ^= tweak1_2_2;
 
-            ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0];
-
             ah2 ^= ch;
             al2 ^= cl;
             idx2 = al2;
@@ -2300,26 +2560,182 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+    }
+
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,      // three blobs of `size` bytes each, back to back
+                                    size_t size,                            // length of ONE blob; bytes 35..42 of each blob are read, unchecked
+                                    uint8_t* __restrict__ output,           // results go to output, output + 32 and output + 64
+                                    ScratchPad** __restrict__ scratchPad)   // entries [0..2] supply each lane's state and memory
+    {   // Runs three hashes in lockstep, one lane per scratchPad entry.
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        // Per-lane tweak: the 8 input bytes at offset 35 XORed with 64-bit word 24 of that lane's state.
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
+        // l* = lane scratch memory (written below through casts, despite the const); h* = lane state viewed as 64-bit words.
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        // cn_explode_scratchpad is defined elsewhere; presumably it expands each state into that lane's memory.
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
+        // Running values per lane: (ah:al) is the key of the AES step, bx is XORed into every block that is stored.
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        // idx* is the current offset into the lane's memory; it is always masked with MASK before use.
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
+            // AES step on the 16-byte block at idx & MASK, keyed with (ah:al): soft_aesenc when SOFT_AES, else _mm_aesenc_si128.
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+            }
+            // Write bx ^ cx back over the block that was just read.
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            // Patch byte 11 of each stored block: XOR in two bits taken from `table` at an index built from the byte itself.
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            // The next offset is taken from cx via EXTRACT64 (macro defined elsewhere); cx then becomes the new bx.
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+
+            // Lane 0: __umul128(idx, cl, &hi) yields lo/hi (presumably the 128-bit product); hi is added to al, lo to ah.
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);
+
+            al0 += hi;
+            ah0 += lo;
+            // Store (al, ah ^ tweak); the second XOR restores ah in the local variable.
+            ah0 ^= tweak1_2_0;
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;
+            // Fold the stored low word into the stored high word (memory only; al/ah are not changed by this).
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0];
+            // Mix in the block's previous contents (cl/ch were read before the store); al becomes the next offset.
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+
+            // Lane 1: same multiply, store, fold and mix sequence as lane 0.
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            // Lane 2: same multiply, store, fold and mix sequence as lane 0.
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ah2 ^= tweak1_2_2;
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ah2 ^= tweak1_2_2;
+
+            ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0];
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+        }
+        // cn_implode_scratchpad is defined elsewhere; presumably it folds each lane's memory back into its state.
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        // The low two bits of state[0] select which extra_hashes entry produces each lane's output.
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+    }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2346,19 +2762,17 @@ public:
             __m128i cx2;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-            }
-            else {
-                cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-#           endif
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -2389,13 +2803,14 @@ public:
             al0 ^= cl;
             idx0 = al0;
 
-            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);
+            const int64_t q0 = n0 / (d0 | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = d ^ q;
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
 
+            idx0 = d0 ^ q0;
 
             cl = ((uint64_t*) &l1[idx1 & MASK])[0];
             ch = ((uint64_t*) &l1[idx1 & MASK])[1];
@@ -2411,12 +2826,14 @@ public:
             al1 ^= cl;
             idx1 = al1;
 
-            n  = ((int64_t*)&l1[idx1 & MASK])[0];
-            d  = ((int32_t*)&l1[idx1 & MASK])[2];
-            q = n / (d | 0x5);
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = d ^ q;
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
+
+            idx1 = d1 ^ q1;
 
 
             cl = ((uint64_t*) &l2[idx2 & MASK])[0];
@@ -2433,12 +2850,15 @@ public:
             al2 ^= cl;
             idx2 = al2;
 
-            n  = ((int64_t*)&l2[idx2 & MASK])[0];
-            d  = ((int32_t*)&l2[idx2 & MASK])[2];
-            q = n / (d | 0x5);
 
-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = d ^ q;
+            const int64x2_t x2 = vld1q_s64(reinterpret_cast<const int64_t*>(&l2[idx2 & MASK]));
+            const int64_t n2 = vgetq_lane_s64(x2, 0);
+            const int32_t d2 = vgetq_lane_s32(x2, 2);
+            const int64_t q2 = n2 / (d2 | 0x5);
+
+            ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2;
+
+            idx2 = d2 ^ q2;
         }
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -2449,26 +2869,26 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2495,19 +2915,17 @@ public:
             __m128i cx2;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-            }
-            else {
-                cx0 = _mm_load_si128((__m128i *) &l0[idx0 & MASK]);
-                cx1 = _mm_load_si128((__m128i *) &l1[idx1 & MASK]);
-                cx2 = _mm_load_si128((__m128i *) &l2[idx2 & MASK]);
-#           ifndef XMRIG_ARMv7
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-#           endif
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -2538,12 +2956,14 @@ public:
             al0 ^= cl;
             idx0 = al0;
 
-            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
-            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
-            int64_t q = n / (d | 0x5);
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);
+            const int64_t q0 = n0 / (d0 | 0x5);
 
-            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
-            idx0 = (~d) ^ q;
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
+
+            idx0 = (~d0) ^ q0;
 
 
             cl = ((uint64_t*) &l1[idx1 & MASK])[0];
@@ -2560,13 +2980,14 @@ public:
             al1 ^= cl;
             idx1 = al1;
 
-            n  = ((int64_t*)&l1[idx1 & MASK])[0];
-            d  = ((int32_t*)&l1[idx1 & MASK])[2];
-            q = n / (d | 0x5);
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
 
-            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
-            idx1 = (~d) ^ q;
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
 
+            idx1 = (~d1) ^ q1;
 
             cl = ((uint64_t*) &l2[idx2 & MASK])[0];
             ch = ((uint64_t*) &l2[idx2 & MASK])[1];
@@ -2582,12 +3003,14 @@ public:
             al2 ^= cl;
             idx2 = al2;
 
-            n  = ((int64_t*)&l2[idx2 & MASK])[0];
-            d  = ((int32_t*)&l2[idx2 & MASK])[2];
-            q = n / (d | 0x5);
+            const int64x2_t x2 = vld1q_s64(reinterpret_cast<const int64_t*>(&l2[idx2 & MASK]));
+            const int64_t n2 = vgetq_lane_s64(x2, 0);
+            const int32_t d2 = vgetq_lane_s32(x2, 2);
+            const int64_t q2 = n2 / (d2 | 0x5);
 
-            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
-            idx2 = (~d) ^ q;
+            ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2;
+
+            idx2 = (~d2) ^ q2;
         }
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
@@ -2598,9 +3021,246 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,      // three blobs of `size` bytes each, back to back
+                                     size_t size,                            // length of ONE blob; bytes 35..42 of each blob are read, unchecked
+                                     uint8_t* __restrict__ output,           // results go to output, output + 32 and output + 64
+                                     ScratchPad** __restrict__ scratchPad)   // entries [0..2] supply each lane's state and memory
+    {   // Runs three hashes in lockstep, one lane per scratchPad entry; each iteration ends with a signed-division step.
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        // Per-lane tweak: the 8 input bytes at offset 35 XORed with 64-bit word 24 of that lane's state.
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
+        // l* = lane scratch memory (written below through casts, despite the const); h* = lane state viewed as 64-bit words.
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        // cn_explode_scratchpad_heavy is defined elsewhere; presumably it expands each state into that lane's memory.
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
+        // Running values per lane: (ah:al) is the key of the AES step, bx is XORed into every block that is stored.
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        // idx* is the current offset into the lane's memory; it is always masked with MASK before use.
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        // 16-byte aligned scratch words for the table-based AES step below; v64 is not referenced in this function.
+        union alignas(16)
+        {
+            uint32_t k[4];
+            uint64_t v64[2];
+        };
+        alignas(16) uint32_t x[4];
+        // BYTE(p, i) reads byte i of object p in memory order; the macro is removed again right after the loop.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+            __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+            __m128i cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+            // Lane 0: the AES step in this loop always goes through the saes_table lookups; SOFT_AES is not consulted here.
+            const __m128i& key0 = _mm_set_epi64x(ah0, al0);
+            // k starts as the key (ah:al); x is the loaded block XORed with the all-ones compare result (bitwise NOT).
+            _mm_store_si128((__m128i*) k, key0);
+            cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*) x, cx0);
+            // Chained lookups: each k word is updated from x and then XORed back into x, so later words see earlier results.
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+            // The four updated k words are the result of the step for this lane.
+            cx0 = _mm_load_si128((__m128i*) k);
+            // Lane 1: same step, keyed with (ah1:al1); k and x are reused.
+            const __m128i& key1 = _mm_set_epi64x(ah1, al1);
+
+            _mm_store_si128((__m128i*) k, key1);
+            cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*) x, cx1);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+
+            cx1 = _mm_load_si128((__m128i*) k);
+            // Lane 2: same step, keyed with (ah2:al2); k and x are reused.
+            const __m128i& key2 = _mm_set_epi64x(ah2, al2);
+
+            _mm_store_si128((__m128i*) k, key2);
+            cx2 = _mm_xor_si128(cx2, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*) x, cx2);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^
+                    saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^
+                    saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^
+                    saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^
+                    saes_table[3][BYTE(x[2], 3)];
+
+            cx2 = _mm_load_si128((__m128i*) k);
+            // Write bx ^ cx back over the block that was read at the top of the iteration.
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            // Patch byte 11 of each stored block: XOR in two bits taken from `table` at an index built from the byte itself.
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            // The next offset is taken from cx via EXTRACT64 (macro defined elsewhere); cx then becomes the new bx.
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+            // Lane 0: __umul128(idx, cl, &hi) yields lo/hi (presumably the 128-bit product); hi is added to al, lo to ah.
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);
+
+            al0 += hi;
+            ah0 += lo;
+            // Store (al, ah ^ tweak); the second XOR restores ah in the local variable.
+            ah0 ^= tweak1_2_0;
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;
+            // Fold the stored low word into the stored high word (memory only; al/ah are not changed by this).
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0];
+            // Mix in the block's previous contents (cl/ch were read before the store); al is the offset for the division step.
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+            // Division step: n = signed low 64 bits of the block at the new offset, d = its 32-bit lane 2; d | 0x5 is never zero.
+            const int64x2_t x0 = vld1q_s64(reinterpret_cast<const int64_t*>(&l0[idx0 & MASK]));
+            const int64_t n0 = vgetq_lane_s64(x0, 0);
+            const int32_t d0 = vgetq_lane_s32(x0, 2);  // NOTE(review): x0 is int64x2_t, the intrinsic is declared for int32x4_t — confirm build flags allow this
+            const int64_t q0 = n0 / (d0 | 0x5);
+            // NOTE(review): d == -1 gives a divisor of -1, and INT64_MIN / -1 overflows in C++ — confirm the target tolerates it.
+            ((int64_t*) &l0[idx0 & MASK])[0] = n0 ^ q0;
+            // The block's low word now holds n ^ q; the lane's next offset is d ^ q.
+            idx0 = d0 ^ q0;
+
+            // Lane 1: same multiply, store, fold, mix and division sequence as lane 0.
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            const int64x2_t x1 = vld1q_s64(reinterpret_cast<const int64_t*>(&l1[idx1 & MASK]));
+            const int64_t n1 = vgetq_lane_s64(x1, 0);
+            const int32_t d1 = vgetq_lane_s32(x1, 2);
+            const int64_t q1 = n1 / (d1 | 0x5);
+
+            ((int64_t*) &l1[idx1 & MASK])[0] = n1 ^ q1;
+
+            idx1 = d1 ^ q1;
+
+            // Lane 2: same multiply, store, fold, mix and division sequence as lane 0.
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ah2 ^= tweak1_2_2;
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ah2 ^= tweak1_2_2;
+
+            ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0];
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+
+            const int64x2_t x2 = vld1q_s64(reinterpret_cast<const int64_t*>(&l2[idx2 & MASK]));
+            const int64_t n2 = vgetq_lane_s64(x2, 0);
+            const int32_t d2 = vgetq_lane_s32(x2, 2);
+            const int64_t q2 = n2 / (d2 | 0x5);
+
+            ((int64_t*) &l2[idx2 & MASK])[0] = n2 ^ q2;
+
+            idx2 = d2 ^ q2;
+        }
+#undef BYTE
+        // cn_implode_scratchpad_heavy is defined elsewhere; presumably it folds each lane's memory back into its state.
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        // The low two bits of state[0] select which extra_hashes entry produces each lane's output.
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+    }
 };
 
@@ -2610,22 +3270,22 @@ class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 4>
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output,  // passed to extra_hashes at offsets 0, 32, 64 and 96 (see end of function)
+                            ScratchPad** __restrict__ scratchPad)  // indexed 0..3; every entry supplies its own state and memory
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak(input + 3 * size, (int) size, ctx->state[3], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);  // the four inputs are read back to back, size bytes apart
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+        const uint8_t* l0 = scratchPad[0]->memory;  // lN: scratchpad memory of lane N, taken from its own ScratchPad
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);  // hN: state of lane N viewed as 64-bit words
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2658,22 +3318,20 @@ public:
             __m128i cx3;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));  // soft path: soft_aesenc gets the scratchpad address and the (ahN:alN) key
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
             } else {
-# 			ifndef XMRIG_ARMv7
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
                 cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
                 cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
 
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-                cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));  // cxN was loaded from the scratchpad just above; key is (ahN:alN). NOTE(review): the intrinsic is presumably provided for ARM by SSE2NEON.h - confirm
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -2763,231 +3421,39 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // table entry chosen by the low two bits of state[0]; lane k is given output + 32 * k
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
     }
 
-  inline static void hashPowV2(const uint8_t* __restrict__ input,
-                          size_t size,
-                          uint8_t *__restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
-  {
-    keccak(input, (int) size, ctx->state[0], 200);
-    keccak(input + size, (int) size, ctx->state[1], 200);
-    keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-    keccak(input + 3 * size, (int) size, ctx->state[3], 200);
-
-      uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-      uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
-      uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
-      uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEM;
-    const uint8_t* l2 = ctx->memory + 2 * MEM;
-    const uint8_t* l3 = ctx->memory + 3 * MEM;
-    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-    uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-    uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-    uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h3, (__m128i*) l3);
-
-    uint64_t al0 = h0[0] ^h0[4];
-    uint64_t al1 = h1[0] ^h1[4];
-    uint64_t al2 = h2[0] ^h2[4];
-    uint64_t al3 = h3[0] ^h3[4];
-    uint64_t ah0 = h0[1] ^h0[5];
-    uint64_t ah1 = h1[1] ^h1[5];
-    uint64_t ah2 = h2[1] ^h2[5];
-    uint64_t ah3 = h3[1] ^h3[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-    __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-    __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-
-    uint64_t idx0 = h0[0] ^h0[4];
-    uint64_t idx1 = h1[0] ^h1[4];
-    uint64_t idx2 = h2[0] ^h2[4];
-    uint64_t idx3 = h3[0] ^h3[4];
-
-    for (size_t i = 0; i < ITERATIONS; i++) {
-      __m128i cx0;
-      __m128i cx1;
-      __m128i cx2;
-      __m128i cx3;
-
-      if (SOFT_AES) {
-        cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-        cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-        cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
-      } else {
-# 			ifndef XMRIG_ARMv7
-        cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-        cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-        cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
-        cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
-
-        cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-        cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-        cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-        cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-#           endif
-      }
-
-      _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-      _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-      _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
-      _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
-
-          static const uint32_t table = 0x75310;
-          uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
-          uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-
-      idx0 = EXTRACT64(cx0);
-      idx1 = EXTRACT64(cx1);
-      idx2 = EXTRACT64(cx2);
-      idx3 = EXTRACT64(cx3);
-
-      bx0 = cx0;
-      bx1 = cx1;
-      bx2 = cx2;
-      bx3 = cx3;
-
-
-      uint64_t hi, lo, cl, ch;
-      cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-      ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-      lo = __umul128(idx0, cl, &hi);
-
-      al0 += hi;
-      ah0 += lo;
-
-          ah0 ^= tweak1_2_0;
-          ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-          ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
-          ah0 ^= tweak1_2_0;
-
-      ah0 ^= ch;
-      al0 ^= cl;
-      idx0 = al0;
-
-
-      cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-      ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-      lo = __umul128(idx1, cl, &hi);
-
-      al1 += hi;
-      ah1 += lo;
-
-          ah1 ^= tweak1_2_1;
-          ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-          ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
-          ah1 ^= tweak1_2_1;
-
-      ah1 ^= ch;
-      al1 ^= cl;
-      idx1 = al1;
-
-
-      cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-      ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-      lo = __umul128(idx2, cl, &hi);
-
-      al2 += hi;
-      ah2 += lo;
-
-          ah2 ^= tweak1_2_2;
-          ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-          ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
-          ah2 ^= tweak1_2_2;
-
-      ah2 ^= ch;
-      al2 ^= cl;
-      idx2 = al2;
-
-
-      cl = ((uint64_t*) &l3[idx3 & MASK])[0];
-      ch = ((uint64_t*) &l3[idx3 & MASK])[1];
-      lo = __umul128(idx3, cl, &hi);
-
-      al3 += hi;
-      ah3 += lo;
-
-          ah3 ^= tweak1_2_3;
-          ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
-          ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
-          ah3 ^= tweak1_2_3;
-
-      ah3 ^= ch;
-      al3 ^= cl;
-      idx3 = al3;
-    }
-
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l3, (__m128i*) h3);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-    keccakf(h2, 24);
-    keccakf(h3, 24);
-
-    extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-    extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-    extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-    extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-  }
-
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input,  // four inputs laid out back to back, size bytes each
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output,  // passed to extra_hashes at offsets 0, 32, 64 and 96 (see end of function)
+                                 ScratchPad** __restrict__ scratchPad)  // indexed 0..3; every entry supplies its own state and memory
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak(input + 3 * size, (int) size, ctx->state[3], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200);  // lane k reads from input + k * size into scratchPad[k]->state
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));  // the + 24 counts uint64_t words, so this is the 8 bytes at state offset 192
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
         uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));  // NOTE(review): each tweak reads input bytes 35..42 of its lane with no visible check that size >= 43
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+        const uint8_t* l0 = scratchPad[0]->memory;  // lN: scratchpad memory of lane N, taken from its own ScratchPad
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);  // hN: state of lane N viewed as 64-bit words
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3020,22 +3486,20 @@ public:
             __m128i cx3;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));  // soft path: soft_aesenc gets the scratchpad address and the (ahN:alN) key
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
             } else {
-# 			ifndef XMRIG_ARMv7
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
                 cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
                 cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
 
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-                cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));  // cxN was loaded from the scratchpad just above; key is (ahN:alN). NOTE(review): the intrinsic is presumably provided for ARM by SSE2NEON.h - confirm
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -3046,16 +3510,16 @@ public:
             static const uint32_t table = 0x75310;
             uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
             uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);  // rewrites byte 11 of the slot: XOR with bits 4-5 of (table >> index)
             tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
             idx0 = EXTRACT64(cx0);
             idx1 = EXTRACT64(cx1);
@@ -3081,8 +3545,6 @@ public:
             ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
             ah0 ^= tweak1_2_0;
 
-            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
-
             ah0 ^= ch;
             al0 ^= cl;
             idx0 = al0;
@@ -3100,8 +3562,6 @@ public:
             ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
             ah1 ^= tweak1_2_1;
 
-            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
-
             ah1 ^= ch;
             al1 ^= cl;
             idx1 = al1;
@@ -3119,7 +3579,201 @@ public:
             ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
             ah2 ^= tweak1_2_2;
 
-            ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0];
+            ah2 ^= ch;  // fold the qwords read before the store (cl, ch) back into the running (ah2:al2)
+            al2 ^= cl;
+            idx2 = al2;  // the lane-2 slot of the next iteration is addressed by the updated al2
+
+
+            cl = ((uint64_t*) &l3[idx3 & MASK])[0];  // lane 3 repeats the same sequence as lanes 0..2
+            ch = ((uint64_t*) &l3[idx3 & MASK])[1];
+            lo = __umul128(idx3, cl, &hi);  // lo is the return value, hi comes back through the pointer; presumably the 128-bit product of idx3 and cl - confirm against the __umul128 definition
+
+            al3 += hi;  // note the cross-over: hi feeds al3, lo feeds ah3
+            ah3 += lo;
+
+            ah3 ^= tweak1_2_3;  // the tweak is applied only to the value stored below ...
+            ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
+            ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
+            ah3 ^= tweak1_2_3;  // ... and undone again, so ah3 itself continues untweaked
+
+            ah3 ^= ch;
+            al3 ^= cl;
+            idx3 = al3;
+        }
+
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);  // arguments are (scratchpad, state), the reverse of cn_explode_scratchpad above
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l3, (__m128i*) h3);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        keccakf(h3, 24);
+
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // table entry chosen by the low two bits of state[0]; lane k is given output + 32 * k
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+    }
+
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,  // four inputs laid out back to back, size bytes each
+                                    size_t size,
+                                    uint8_t* __restrict__ output,  // passed to extra_hashes at offsets 0, 32, 64 and 96 (see end of function)
+                                    ScratchPad** __restrict__ scratchPad)  // indexed 0..3; every entry supplies its own state and memory
+    {
+        keccak(input, (int) size, scratchPad[0]->state, 200);  // lane k reads from input + k * size into scratchPad[k]->state
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
+
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));  // the + 24 counts uint64_t words, so this is the 8 bytes at state offset 192
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
+        uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));  // NOTE(review): each tweak reads input bytes 35..42 of its lane with no visible check that size >= 43
+
+        const uint8_t* l0 = scratchPad[0]->memory;  // lN: scratchpad memory of lane N, taken from its own ScratchPad
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);  // hN: state of lane N viewed as 64-bit words
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);  // arguments are (state, scratchpad)
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h3, (__m128i*) l3);
+
+        uint64_t al0 = h0[0] ^h0[4];  // (ahN:alN) starts as state words 0..1 XOR state words 4..5
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t al3 = h3[0] ^h3[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+        uint64_t ah3 = h3[1] ^h3[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);  // bxN starts as state words 2..3 XOR state words 6..7
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];  // the first scratchpad address equals the initial alN
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx3 = h3[0] ^h3[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
+            __m128i cx3;
+
+            if (SOFT_AES) {
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));  // soft path: soft_aesenc gets the scratchpad address and the (ahN:alN) key
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));  // key is (ahN:alN). NOTE(review): the intrinsic is presumably provided for ARM by SSE2NEON.h - confirm
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+            }
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));  // bxN XOR cxN goes back into the slot cxN was derived from
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
+
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);  // rewrites byte 11 of the slot just stored: XOR with bits 4-5 of (table >> index)
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
+            idx0 = EXTRACT64(cx0);  // second address of the iteration comes from cxN (EXTRACT64 presumably yields its low 64 bits - confirm)
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+            idx3 = EXTRACT64(cx3);
+
+            bx0 = cx0;  // cxN of this iteration is bxN of the next one
+            bx1 = cx1;
+            bx2 = cx2;
+            bx3 = cx3;
+
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi);  // lo is the return value, hi comes back through the pointer; presumably the 128-bit product of idx0 and cl - confirm against the __umul128 definition
+
+            al0 += hi;  // note the cross-over: hi feeds al0, lo feeds ah0
+            ah0 += lo;
+
+            ah0 ^= tweak1_2_0;  // the tweak is applied only to the value stored below ...
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;  // ... and undone again, so ah0 itself continues untweaked
+
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0];  // extra step that hashPowV2 does not have: XOR the stored low qword into the stored high qword
+
+            ah0 ^= ch;  // fold the qwords read before the store (cl, ch) back into the running (ah0:al0)
+            al0 ^= cl;
+            idx0 = al0;  // the lane-0 slot of the next iteration is addressed by the updated al0
+
+
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];  // lanes 1..3 repeat the lane-0 sequence
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ah2 ^= tweak1_2_2;
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ah2 ^= tweak1_2_2;
+
+            ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0];
 
             ah2 ^= ch;
             al2 ^= cl;
@@ -3138,7 +3792,7 @@ public:
             ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
             ah3 ^= tweak1_2_3;
 
-            ((uint64_t*)&l3[idx3 & MASK])[1] ^= ((uint64_t*)&l3[idx3 & MASK])[0];
+            ((uint64_t*) &l3[idx3 & MASK])[1] ^= ((uint64_t*) &l3[idx3 & MASK])[0];  // lane 3: same fold of the stored low qword into the stored high qword
 
             ah3 ^= ch;
             al3 ^= cl;
@@ -3155,24 +3809,32 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);  // table entry chosen by the low two bits of state[0]; lane k is given output + 32 * k
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,  // never written: the body below is empty
+                                 ScratchPad** __restrict__ scratchPad)  // unused, like every other parameter of this stub
     {
         // not supported
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
+    {
+        // not supported: the body is empty, so no parameter is read and nothing is written to output
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,  // never written: the body below is empty
+                                     ScratchPad** __restrict__ scratchPad)  // unused, like every other parameter of this stub
     {
         // not supported
     }
@@ -3180,29 +3842,29 @@ public:
 
 template<size_t ITERATIONS, size_t INDEX_SHIFT, size_t MEM, size_t MASK, bool SOFT_AES>
 class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 5>
-{
+{ // specialization that computes five hashes in parallel, one ScratchPad per hash
 public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
-                            uint8_t *__restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            uint8_t* __restrict__ output, // lane N's result is written at output + 32 * N (N = 0..4)
+                            ScratchPad** __restrict__ scratchPad) // scratchPad[0]..scratchPad[4] are dereferenced, one per lane
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak(input + 3 * size, (int) size, ctx->state[3], 200);
-        keccak(input + 4 * size, (int) size, ctx->state[4], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200); // lane 0 input; lanes 1..4 follow at multiples of `size`
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
+        keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        const uint8_t* l4 = ctx->memory + 4 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-        uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+        const uint8_t* l0 = scratchPad[0]->memory; // each lane uses the memory buffer of its own ScratchPad; declared const but written through casts below
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        const uint8_t* l4 = scratchPad[4]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state); // 64-bit word view of the state filled by keccak() above
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3241,25 +3903,23 @@ public:
             __m128i cx4;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
-                cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); // SOFT_AES path: soft_aesenc() gets the block address and the (ah, al) pair
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
             } else {
-# 			ifndef XMRIG_ARMv7
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
                 cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
                 cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
                 cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
 
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-                cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-                cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); // non-SOFT_AES path: the block loaded above goes through _mm_aesenc_si128 with the (ah, al) pair as second operand
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+                cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -3368,278 +4028,46 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); // state[0] & 3 selects which extra_hashes entry produces the lane's output
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+        extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
     }
 
-  inline static void hashPowV2(const uint8_t* __restrict__ input,
-                          size_t size,
-                          uint8_t *__restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
-  {
-    keccak(input, (int) size, ctx->state[0], 200);
-    keccak(input + size, (int) size, ctx->state[1], 200);
-    keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-    keccak(input + 3 * size, (int) size, ctx->state[3], 200);
-    keccak(input + 4 * size, (int) size, ctx->state[4], 200);
-
-      uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-      uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
-      uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
-      uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
-      uint64_t tweak1_2_4 = (*reinterpret_cast<const uint64_t*>(input + 35 + 4 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[4]) + 24));
-
-
-    const uint8_t* l0 = ctx->memory;
-    const uint8_t* l1 = ctx->memory + MEM;
-    const uint8_t* l2 = ctx->memory + 2 * MEM;
-    const uint8_t* l3 = ctx->memory + 3 * MEM;
-    const uint8_t* l4 = ctx->memory + 4 * MEM;
-    uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-    uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-    uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-    uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-    uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
-
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h3, (__m128i*) l3);
-    cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h4, (__m128i*) l4);
-
-    uint64_t al0 = h0[0] ^h0[4];
-    uint64_t al1 = h1[0] ^h1[4];
-    uint64_t al2 = h2[0] ^h2[4];
-    uint64_t al3 = h3[0] ^h3[4];
-    uint64_t al4 = h4[0] ^h4[4];
-    uint64_t ah0 = h0[1] ^h0[5];
-    uint64_t ah1 = h1[1] ^h1[5];
-    uint64_t ah2 = h2[1] ^h2[5];
-    uint64_t ah3 = h3[1] ^h3[5];
-    uint64_t ah4 = h4[1] ^h4[5];
-
-    __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
-    __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
-    __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
-    __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
-    __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
-
-    uint64_t idx0 = h0[0] ^h0[4];
-    uint64_t idx1 = h1[0] ^h1[4];
-    uint64_t idx2 = h2[0] ^h2[4];
-    uint64_t idx3 = h3[0] ^h3[4];
-    uint64_t idx4 = h4[0] ^h4[4];
-
-    for (size_t i = 0; i < ITERATIONS; i++) {
-      __m128i cx0;
-      __m128i cx1;
-      __m128i cx2;
-      __m128i cx3;
-      __m128i cx4;
-
-      if (SOFT_AES) {
-        cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-        cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-        cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-        cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
-        cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
-      } else {
-# 			ifndef XMRIG_ARMv7
-        cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
-        cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
-        cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
-        cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
-        cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
-
-        cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-        cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-        cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-        cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-        cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4);
-#           endif
-      }
-
-      _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
-      _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
-      _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
-      _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
-      _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4));
-
-          static const uint32_t table = 0x75310;
-          uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
-          uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-          tmp = reinterpret_cast<const uint8_t*>(&l4[idx4 & MASK])[11];
-          index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-          ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
-
-      idx0 = EXTRACT64(cx0);
-      idx1 = EXTRACT64(cx1);
-      idx2 = EXTRACT64(cx2);
-      idx3 = EXTRACT64(cx3);
-      idx4 = EXTRACT64(cx4);
-
-      bx0 = cx0;
-      bx1 = cx1;
-      bx2 = cx2;
-      bx3 = cx3;
-      bx4 = cx4;
-
-      uint64_t hi, lo, cl, ch;
-      cl = ((uint64_t*) &l0[idx0 & MASK])[0];
-      ch = ((uint64_t*) &l0[idx0 & MASK])[1];
-      lo = __umul128(idx0, cl, &hi);
-
-      al0 += hi;
-      ah0 += lo;
-
-          ah0 ^= tweak1_2_0;
-          ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
-          ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
-          ah0 ^= tweak1_2_0;
-
-      ah0 ^= ch;
-      al0 ^= cl;
-      idx0 = al0;
-
-
-      cl = ((uint64_t*) &l1[idx1 & MASK])[0];
-      ch = ((uint64_t*) &l1[idx1 & MASK])[1];
-      lo = __umul128(idx1, cl, &hi);
-
-      al1 += hi;
-      ah1 += lo;
-
-          ah1 ^= tweak1_2_1;
-          ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
-          ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
-          ah1 ^= tweak1_2_1;
-
-      ah1 ^= ch;
-      al1 ^= cl;
-      idx1 = al1;
-
-
-      cl = ((uint64_t*) &l2[idx2 & MASK])[0];
-      ch = ((uint64_t*) &l2[idx2 & MASK])[1];
-      lo = __umul128(idx2, cl, &hi);
-
-      al2 += hi;
-      ah2 += lo;
-
-          ah2 ^= tweak1_2_2;
-          ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
-          ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
-          ah2 ^= tweak1_2_2;
-
-      ah2 ^= ch;
-      al2 ^= cl;
-      idx2 = al2;
-
-
-      cl = ((uint64_t*) &l3[idx3 & MASK])[0];
-      ch = ((uint64_t*) &l3[idx3 & MASK])[1];
-      lo = __umul128(idx3, cl, &hi);
-
-      al3 += hi;
-      ah3 += lo;
-
-          ah3 ^= tweak1_2_3;
-          ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
-          ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
-          ah3 ^= tweak1_2_3;
-
-      ah3 ^= ch;
-      al3 ^= cl;
-      idx3 = al3;
-
-
-      cl = ((uint64_t*) &l4[idx4 & MASK])[0];
-      ch = ((uint64_t*) &l4[idx4 & MASK])[1];
-      lo = __umul128(idx4, cl, &hi);
-
-      al4 += hi;
-      ah4 += lo;
-
-          ah4 ^= tweak1_2_4;
-          ((uint64_t*) &l4[idx4 & MASK])[0] = al4;
-          ((uint64_t*) &l4[idx4 & MASK])[1] = ah4;
-          ah4 ^= tweak1_2_4;
-
-      ah4 ^= ch;
-      al4 ^= cl;
-      idx4 = al4;
-    }
-
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l3, (__m128i*) h3);
-    cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l4, (__m128i*) h4);
-
-    keccakf(h0, 24);
-    keccakf(h1, 24);
-    keccakf(h2, 24);
-    keccakf(h3, 24);
-    keccakf(h4, 24);
-
-    extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-    extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-    extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-    extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-    extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
-  }
-
-    inline static void hashLiteIpbc (const uint8_t* __restrict__ input,
+    inline static void hashPowV2(const uint8_t* __restrict__ input, // five inputs of `size` bytes each, packed back to back
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output, // lane N's result is written at output + 32 * N (N = 0..4)
+                                 ScratchPad** __restrict__ scratchPad) // scratchPad[0]..scratchPad[4] are dereferenced, one per lane
     {
-        keccak(input, (int) size, ctx->state[0], 200);
-        keccak(input + size, (int) size, ctx->state[1], 200);
-        keccak(input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak(input + 3 * size, (int) size, ctx->state[3], 200);
-        keccak(input + 4 * size, (int) size, ctx->state[4], 200);
+        keccak(input, (int) size, scratchPad[0]->state, 200); // fills each lane's state from that lane's input
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
+        keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24)); // uint64 index 24 == byte offset 192 of the state; NOTE(review): the input read covers bytes 35..42 of the lane, so this assumes size >= 43 — confirm with callers
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
         uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
         uint64_t tweak1_2_4 = (*reinterpret_cast<const uint64_t*>(input + 35 + 4 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[4]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[4]->state) + 24));
 
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        const uint8_t* l4 = ctx->memory + 4 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-        uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+        const uint8_t* l0 = scratchPad[0]->memory; // each lane uses the memory buffer of its own ScratchPad; declared const but written through casts below
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        const uint8_t* l4 = scratchPad[4]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state); // 64-bit word view of the state filled by keccak() above
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3678,25 +4106,23 @@ public:
             __m128i cx4;
 
             if (SOFT_AES) {
-                cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
-                cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
-                cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
-                cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
-                cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); // SOFT_AES path: soft_aesenc() gets the block address and the (ah, al) pair
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
             } else {
-# 			ifndef XMRIG_ARMv7
                 cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
                 cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
                 cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
                 cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
                 cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
 
-                cx0 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx0, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah0, al0);
-                cx1 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx1, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah1, al1);
-                cx2 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx2, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah2, al2);
-                cx3 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx3, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah3, al3);
-                cx4 = vreinterpretq_m128i_u8(vaesmcq_u8(vaeseq_u8(cx4, vdupq_n_u8(0)))) ^ _mm_set_epi64x(ah4, al4);
-#           endif
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); // non-SOFT_AES path: the block loaded above goes through _mm_aesenc_si128 with the (ah, al) pair as second operand
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+                cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4));
             }
 
             _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
@@ -3708,19 +4134,19 @@ public:
             static const uint32_t table = 0x75310;
             uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
             uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); // rewrites byte 11 of the block at l0[idx0 & MASK]; only bits 0x30 can change
             tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
             tmp = reinterpret_cast<const uint8_t*>(&l4[idx4 & MASK])[11];
             index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
-            ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            ((uint8_t*) (&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
 
             idx0 = EXTRACT64(cx0);
             idx1 = EXTRACT64(cx1);
@@ -3747,8 +4173,6 @@ public:
             ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
             ah0 ^= tweak1_2_0;
 
-            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
-
             ah0 ^= ch;
             al0 ^= cl;
             idx0 = al0;
@@ -3766,8 +4190,6 @@ public:
             ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
             ah1 ^= tweak1_2_1;
 
-            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
-
             ah1 ^= ch;
             al1 ^= cl;
             idx1 = al1;
@@ -3785,8 +4207,6 @@ public:
             ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
             ah2 ^= tweak1_2_2;
 
-            ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0];
-
             ah2 ^= ch;
             al2 ^= cl;
             idx2 = al2;
@@ -3804,8 +4224,6 @@ public:
             ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
             ah3 ^= tweak1_2_3;
 
-            ((uint64_t*)&l3[idx3 & MASK])[1] ^= ((uint64_t*)&l3[idx3 & MASK])[0];
-
             ah3 ^= ch;
             al3 ^= cl;
             idx3 = al3;
@@ -3823,7 +4241,245 @@ public:
             ((uint64_t*) &l4[idx4 & MASK])[1] = ah4;
             ah4 ^= tweak1_2_4;
 
-            ((uint64_t*)&l4[idx4 & MASK])[1] ^= ((uint64_t*)&l4[idx4 & MASK])[0];
+            ah4 ^= ch;
+            al4 ^= cl;
+            idx4 = al4; // al4 becomes lane 4's scratchpad index for the next iteration
+        }
+
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0); // NOTE(review): helper defined elsewhere; presumably folds the scratchpad back into the state — confirm
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l3, (__m128i*) h3);
+        cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l4, (__m128i*) h4);
+
+        keccakf(h0, 24); // NOTE(review): 24 is presumably the round count — confirm against keccakf's declaration
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        keccakf(h3, 24);
+        keccakf(h4, 24);
+
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); // state[0] & 3 selects which extra_hashes entry produces the lane's output
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+        extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
+    }
+
+    inline static void hashLiteTube(const uint8_t* __restrict__ input, // five inputs of `size` bytes each, packed back to back
+                                    size_t size,
+                                    uint8_t* __restrict__ output, // lane N's result is written at output + 32 * N (N = 0..4)
+                                    ScratchPad** __restrict__ scratchPad) // scratchPad[0]..scratchPad[4] are dereferenced, one per lane
+    {
+        keccak(input, (int) size, scratchPad[0]->state, 200); // fills each lane's state from that lane's input
+        keccak(input + size, (int) size, scratchPad[1]->state, 200);
+        keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200);
+        keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200);
+
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(input + 35) ^ // 8 bytes at offset 35 of lane 0's input; NOTE(review): assumes size >= 43 — confirm with callers
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24)); // XORed with the state word at uint64 index 24 (byte offset 192)
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(input + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(input + 35 + 2 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
+        uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(input + 35 + 3 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
+        uint64_t tweak1_2_4 = (*reinterpret_cast<const uint64_t*>(input + 35 + 4 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[4]->state) + 24));
+
+
+        const uint8_t* l0 = scratchPad[0]->memory; // per-lane scratchpad memory; declared const but written through casts below
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        const uint8_t* l4 = scratchPad[4]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state); // per-lane state viewed as 64-bit words
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
+
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0); // NOTE(review): helper defined elsewhere; presumably fills the scratchpad from the state — confirm
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h3, (__m128i*) l3);
+        cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h4, (__m128i*) l4);
+
+        uint64_t al0 = h0[0] ^h0[4]; // (ah, al) is the per-lane pair handed to the AES step as _mm_set_epi64x(ah, al)
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t al3 = h3[0] ^h3[4];
+        uint64_t al4 = h4[0] ^h4[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+        uint64_t ah3 = h3[1] ^h3[5];
+        uint64_t ah4 = h4[1] ^h4[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); // bx is XORed with cx before each store, then replaced by cx
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+        __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]);
+        __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4]; // idx starts equal to al; it is always masked with MASK before addressing the scratchpad
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        uint64_t idx3 = h3[0] ^h3[4];
+        uint64_t idx4 = h4[0] ^h4[4];
+
+        for (size_t i = 0; i < ITERATIONS; i++) { // every iteration advances all five lanes
+            __m128i cx0;
+            __m128i cx1;
+            __m128i cx2;
+            __m128i cx3;
+            __m128i cx4;
+
+            if (SOFT_AES) { // template flag: soft_aesenc() versus load + _mm_aesenc_si128
+                cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], _mm_set_epi64x(ah0, al0));
+                cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], _mm_set_epi64x(ah1, al1));
+                cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], _mm_set_epi64x(ah2, al2));
+                cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], _mm_set_epi64x(ah3, al3));
+                cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], _mm_set_epi64x(ah4, al4));
+            } else {
+                cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+                cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+                cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+                cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]);
+                cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]);
+
+                cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0));
+                cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1));
+                cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2));
+                cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3));
+                cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4));
+            }
+
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); // write bx ^ cx back to the block cx was derived from
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3));
+            _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4));
+
+            static const uint32_t table = 0x75310; // packed constant; (table >> index) & 0x30 yields the bits XORed into byte 11
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11]; // byte 11 of the block just stored (idx0 has not changed yet)
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; // always even because of the final << 1
+            ((uint8_t*) (&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l3[idx3 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l4[idx4 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*) (&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
+            idx0 = EXTRACT64(cx0); // NOTE(review): EXTRACT64 is defined elsewhere; presumably the low 64 bits of cx — confirm
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+            idx3 = EXTRACT64(cx3);
+            idx4 = EXTRACT64(cx4);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+            bx3 = cx3;
+            bx4 = cx4;
+
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0]; // lane 0: read both 64-bit words of the block at the new index before they are overwritten below
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi); // NOTE(review): presumably a 64x64 -> 128-bit multiply (low half returned, high half via &hi) — confirm
+
+            al0 += hi;
+            ah0 += lo;
+
+            ah0 ^= tweak1_2_0; // the tweak is applied only to the copy that gets stored...
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0; // ...and removed again from the local value
+
+            ((uint64_t*) &l0[idx0 & MASK])[1] ^= ((uint64_t*) &l0[idx0 & MASK])[0]; // additionally XOR the stored low word into the stored high word
+
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0; // al0 becomes lane 0's scratchpad index for the next iteration
+
+
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0]; // lane 1: same sequence as lane 0
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*) &l1[idx1 & MASK])[1] ^= ((uint64_t*) &l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0]; // lane 2: same sequence as lane 0
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ah2 ^= tweak1_2_2;
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ah2 ^= tweak1_2_2;
+
+            ((uint64_t*) &l2[idx2 & MASK])[1] ^= ((uint64_t*) &l2[idx2 & MASK])[0];
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+
+
+            cl = ((uint64_t*) &l3[idx3 & MASK])[0]; // lane 3: same sequence as lane 0
+            ch = ((uint64_t*) &l3[idx3 & MASK])[1];
+            lo = __umul128(idx3, cl, &hi);
+
+            al3 += hi;
+            ah3 += lo;
+
+            ah3 ^= tweak1_2_3;
+            ((uint64_t*) &l3[idx3 & MASK])[0] = al3;
+            ((uint64_t*) &l3[idx3 & MASK])[1] = ah3;
+            ah3 ^= tweak1_2_3;
+
+            ((uint64_t*) &l3[idx3 & MASK])[1] ^= ((uint64_t*) &l3[idx3 & MASK])[0];
+
+            ah3 ^= ch;
+            al3 ^= cl;
+            idx3 = al3;
+
+
+            cl = ((uint64_t*) &l4[idx4 & MASK])[0]; // lane 4: same sequence as lane 0
+            ch = ((uint64_t*) &l4[idx4 & MASK])[1];
+            lo = __umul128(idx4, cl, &hi);
+
+            al4 += hi;
+            ah4 += lo;
+
+            ah4 ^= tweak1_2_4;
+            ((uint64_t*) &l4[idx4 & MASK])[0] = al4;
+            ((uint64_t*) &l4[idx4 & MASK])[1] = ah4;
+            ah4 ^= tweak1_2_4;
+
+            ((uint64_t*) &l4[idx4 & MASK])[1] ^= ((uint64_t*) &l4[idx4 & MASK])[0];
 
             ah4 ^= ch;
             al4 ^= cl;
@@ -3842,25 +4498,33 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); // state[0] & 3 selects which extra_hashes entry produces the lane's output
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+        extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
                                  size_t size,
-                                 uint8_t *__restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         // not supported
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
                                       size_t size,
-                                      uint8_t *__restrict__ output,
-                                      cryptonight_ctx* __restrict__ ctx)
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
+    {
+        // not supported
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input, // stub: this specialization has an empty body
+                                      size_t size,
+                                      uint8_t* __restrict__ output, // never written by this stub
+                                      ScratchPad** __restrict__ scratchPad)
     {
         // not supported
     }
diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h
index 7a6f9cc3..0aee57b3 100644
--- a/src/crypto/CryptoNight_test.h
+++ b/src/crypto/CryptoNight_test.h
@@ -158,7 +158,7 @@ const static uint8_t test_output_heavy[160] = {
 };
 
 // CN-Heavy Haven
-const static uint8_t test_output_heavy_haven[160] = {
+const static uint8_t test_output_heavy_haven[96] = {
 		0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57,
 		0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6,
 		0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F,
@@ -167,4 +167,14 @@ const static uint8_t test_output_heavy_haven[160] = {
 		0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B
 };
 
+// CN-Heavy Tube: reference output bytes (96 bytes; presumably three 32-byte hashes - confirm against the self-test)
+const static uint8_t test_output_heavy_tube[96] = {
+        0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf,
+        0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35,
+        0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3,
+        0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74,
+        0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f,
+        0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb
+};
+
 #endif /* __CRYPTONIGHT_TEST_H__ */
diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h
index bfb8a122..6ac2098d 100644
--- a/src/crypto/CryptoNight_x86.h
+++ b/src/crypto/CryptoNight_x86.h
@@ -475,7 +475,7 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -485,19 +485,18 @@ public:
         uint64_t idx[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
             al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
             ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5];
-            bx[hashBlock] =
-                    _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]);
+            bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]);
             idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
         }
 
@@ -538,7 +537,7 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
                                                        output + hashBlock * 32);
         }
     }
@@ -546,7 +545,7 @@ public:
     inline static void hashPowV2(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -557,14 +556,14 @@ public:
         uint64_t tweak1_2[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
             tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + hashBlock * size) ^
-                    *(reinterpret_cast<const uint64_t*>(ctx->state[hashBlock]) + 24));
+                    *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -619,15 +618,15 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
                                                        output + hashBlock * 32);
         }
     }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
+                                    size_t size,
+                                    uint8_t* __restrict__ output,
+                                    ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -638,14 +637,14 @@ public:
         uint64_t tweak1_2[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
             tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + hashBlock * size) ^
-                                   *(reinterpret_cast<const uint64_t*>(ctx->state[hashBlock]) + 24));
+                                   *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -704,7 +703,7 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
                                                        output + hashBlock * 32);
         }
     }
@@ -712,7 +711,7 @@ public:
     inline static void hashHeavy(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -722,12 +721,12 @@ public:
         uint64_t idx[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -781,7 +780,7 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
                                                        output + hashBlock * 32);
         }
     }
@@ -789,7 +788,7 @@ public:
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l[NUM_HASH_BLOCKS];
         uint64_t* h[NUM_HASH_BLOCKS];
@@ -799,12 +798,12 @@ public:
         uint64_t idx[NUM_HASH_BLOCKS];
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, ctx->state[hashBlock], 200);
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
         }
 
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
-            l[hashBlock] = ctx->memory + hashBlock * MEM;
-            h[hashBlock] = reinterpret_cast<uint64_t*>(ctx->state[hashBlock]);
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
 
             cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
 
@@ -858,12 +857,122 @@ public:
         for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
             cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
             keccakf(h[hashBlock], 24);
-            extra_hashes[ctx->state[hashBlock][0] & 3](ctx->state[hashBlock], 200,
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
                                                        output + hashBlock * 32);
         }
     }
-};
 
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input, // NUM_HASH_BLOCKS inputs back to back, `size` bytes each
+                                     size_t size, // length in bytes of one input
+                                     uint8_t* __restrict__ output, // hash i is finalized into output + i * 32
+                                     ScratchPad** __restrict__ scratchPad) // one ScratchPad (state + memory) per hash
+    { // CN-Heavy "Tube" variant: computes NUM_HASH_BLOCKS hashes, stepping all of them once per loop iteration.
+        const uint8_t* l[NUM_HASH_BLOCKS];
+        uint64_t* h[NUM_HASH_BLOCKS];
+        uint64_t al[NUM_HASH_BLOCKS];
+        uint64_t ah[NUM_HASH_BLOCKS];
+        __m128i bx[NUM_HASH_BLOCKS];
+        uint64_t idx[NUM_HASH_BLOCKS];
+        uint64_t tweak1_2[NUM_HASH_BLOCKS];
+        // Keccak each input into its state; the tweak is 8 input bytes at offset 35 XOR 64-bit state word 24.
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            keccak(static_cast<const uint8_t*>(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200);
+            tweak1_2[hashBlock] = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + hashBlock * size) ^
+                                   *(reinterpret_cast<const uint64_t*>(scratchPad[hashBlock]->state) + 24));
+        } // NOTE(review): the read at offset 35 touches bytes 35..42, so this assumes size >= 43 - confirm with callers
+        // Run the explode step per hash, then seed al/ah/bx/idx from XORs of state words 0..7.
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            l[hashBlock] = scratchPad[hashBlock]->memory;
+            h[hashBlock] = reinterpret_cast<uint64_t*>(scratchPad[hashBlock]->state);
+            // cn_explode_scratchpad_heavy is defined elsewhere; presumably it fills the memory from the state.
+            cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]);
+
+            al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
+            ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5];
+            bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]);
+            idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4];
+        }
+        // 16-byte aligned scratch for the table round: k holds the key and then the result, x the inverted line.
+        union alignas(16) {
+            uint32_t k[4];
+            uint64_t v64[2]; // not referenced in this function
+        };
+        alignas(16) uint32_t x[4];
+        // BYTE(p, i) yields byte i of the object p.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        // The loop below always uses saes_table; SOFT_AES is only forwarded to the explode/implode helpers.
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+                __m128i cx;
+                // Load the 16-byte scratchpad line addressed by idx.
+                cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]);
+                // The key is ah:al, with ah in the upper 64 bits.
+                const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]);
+                // XOR with all-ones (cmpeq of zero against zero) flips every bit of cx before the round.
+                _mm_store_si128((__m128i*)k, key);
+                cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+                _mm_store_si128((__m128i*)x, cx);
+                // Round via saes_table lookups; each finished k word is XORed into x before the next word is computed.
+                k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+                x[0] ^= k[0];
+                k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+                x[1] ^= k[1];
+                k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+                x[2] ^= k[2];
+                k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+                cx = _mm_load_si128((__m128i*)k);
+                // Write bx ^ cx back to the same line.
+                _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx));
+                // Patch byte 11 of that line with two bits selected from the 0x75310 constant.
+                const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[hashBlock][idx[hashBlock] & MASK])[11];
+                static const uint32_t table = 0x75310;
+                const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+                ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+                // cx selects the next line (through EXTRACT64) and becomes bx for the next iteration.
+                idx[hashBlock] = EXTRACT64(cx);
+                bx[hashBlock] = cx;
+                // Multiply idx by the line's first 64-bit word; hi is added to al and lo to ah.
+                uint64_t hi, lo, cl, ch;
+                cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0];
+                ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1];
+                lo = __umul128(idx[hashBlock], cl, &hi);
+
+                al[hashBlock] += hi;
+                ah[hashBlock] += lo;
+                // ah is stored XORed with the tweak; the second XOR below restores the local copy.
+                ah[hashBlock] ^= tweak1_2[hashBlock];
+
+                ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock];
+                ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock];
+
+                ah[hashBlock] ^= tweak1_2[hashBlock];
+                // Fold word 0 into word 1 of the line that was just written.
+                ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[1] ^= ((uint64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0];
+                // Mix in the line's previous contents (cl/ch) and move to the line addressed by al.
+                ah[hashBlock] ^= ch;
+                al[hashBlock] ^= cl;
+                idx[hashBlock] = al[hashBlock];
+                // Signed divide: (d | 0x5) always has bit 0 set, so the divisor is never zero.
+                int64_t n  = ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0];
+                int32_t d  = ((int32_t*)&l[hashBlock][idx[hashBlock] & MASK])[2];
+                int64_t q = n / (d | 0x5); // NOTE(review): INT64_MIN / -1 would overflow - confirm this is acceptable
+                // Store n ^ q back; the next index is d ^ q (d is widened to 64 bits for the XOR).
+                ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q;
+                idx[hashBlock] = d ^ q;
+            }
+        }
+
+#undef BYTE
+        // Finish: run the implode step and keccakf, then dispatch to extra_hashes[state[0] & 3].
+        for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) {
+            cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]);
+            keccakf(h[hashBlock], 24);
+            extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200,
+                                                              output + hashBlock * 32);
+        }
+    }
+};
 
 template<size_t ITERATIONS, size_t INDEX_SHIFT, size_t MEM, size_t MASK, bool SOFT_AES>
 class CryptoNightMultiHash<ITERATIONS, INDEX_SHIFT, MEM, MASK, SOFT_AES, 1>
@@ -872,7 +981,7 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -881,10 +990,10 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -925,13 +1034,13 @@ public:
 
         cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
     inline static void hashPowV2(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -940,12 +1049,12 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
         uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -969,6 +1078,7 @@ public:
             static const uint32_t table = 0x75310;
             const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
             ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+
             idx = EXTRACT64(cx);
             bx = cx;
 
@@ -992,13 +1102,13 @@ public:
 
         cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1007,12 +1117,12 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
         uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1061,13 +1171,13 @@ public:
 
         cn_implode_scratchpad<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1076,10 +1186,10 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1127,13 +1237,13 @@ public:
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
         const uint8_t* l;
         uint64_t* h;
@@ -1142,10 +1252,10 @@ public:
         __m128i bx;
         uint64_t idx;
 
-        keccak(static_cast<const uint8_t*>(input), (int) size, ctx->state[0], 200);
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
 
-        l = ctx->memory;
-        h = reinterpret_cast<uint64_t*>(ctx->state[0]);
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
 
@@ -1193,7 +1303,102 @@ public:
 
         cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
         keccakf(h, 24);
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input, // a single input of `size` bytes
+                                     size_t size, // length in bytes of the input
+                                     uint8_t* __restrict__ output, // handed to the selected extra_hashes finalizer
+                                     ScratchPad** __restrict__ scratchPad) // only scratchPad[0] is used
+    { // CN-Heavy "Tube" variant for one hash (the specialization with a hash count of 1).
+        const uint8_t* l;
+        uint64_t* h;
+        uint64_t al;
+        uint64_t ah;
+        __m128i bx;
+        uint64_t idx;
+        // Keccak the input into the state.
+        keccak(static_cast<const uint8_t*>(input), (int) size, scratchPad[0]->state, 200);
+        // Tweak = 8 input bytes at offset 35 XOR 64-bit state word 24. NOTE(review): assumes size >= 43 - confirm.
+        uint64_t tweak1_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+
+        l = scratchPad[0]->memory;
+        h = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        // cn_explode_scratchpad_heavy is defined elsewhere; presumably it fills the memory from the state.
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h, (__m128i*) l);
+        // Seed the loop registers from XORs of state words 0..7.
+        al = h[0] ^ h[4];
+        ah = h[1] ^ h[5];
+        bx = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]);
+        idx = h[0] ^ h[4];
+        // 16-byte aligned scratch for the table round: k holds the key and then the result, x the inverted line.
+        union alignas(16) {
+            uint32_t k[4];
+            uint64_t v64[2]; // not referenced in this function
+        };
+        alignas(16) uint32_t x[4];
+        // BYTE(p, i) yields byte i of p. The loop always uses saes_table, whatever SOFT_AES is.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx = _mm_load_si128((__m128i*) &l[idx & MASK]);
+            // The key is ah:al, with ah in the upper 64 bits.
+            const __m128i& key = _mm_set_epi64x(ah, al);
+            // XOR with all-ones (cmpeq of zero against zero) flips every bit of cx before the round.
+            _mm_store_si128((__m128i*)k, key);
+            cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*)x, cx);
+            // Round via saes_table lookups; each finished k word is XORed into x before the next word is computed.
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx = _mm_load_si128((__m128i*)k);
+            // Write bx ^ cx back to the line, then patch its byte 11 with two bits selected from 0x75310.
+            _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx, cx));
+            const uint8_t tmp = reinterpret_cast<const uint8_t*>(&l[idx & MASK])[11];
+            static const uint32_t table = 0x75310;
+            const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l[idx & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            // cx selects the next line (through EXTRACT64) and becomes bx for the next iteration.
+            idx = EXTRACT64(cx);
+            bx = cx;
+            // Multiply idx by the line's first 64-bit word; hi is added to al and lo to ah.
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l[idx & MASK])[0];
+            ch = ((uint64_t*) &l[idx & MASK])[1];
+            lo = __umul128(idx, cl, &hi);
+
+            al += hi;
+            ah += lo;
+            // ah is stored XORed with the tweak; the second XOR below restores the local copy.
+            ah ^= tweak1_2;
+            ((uint64_t*) &l[idx & MASK])[0] = al;
+            ((uint64_t*) &l[idx & MASK])[1] = ah;
+            ah ^= tweak1_2;
+            // Fold word 0 into word 1 of the line that was just written.
+            ((uint64_t*)&l[idx & MASK])[1] ^= ((uint64_t*)&l[idx & MASK])[0];
+            // Mix in the line's previous contents (cl/ch) and move to the line addressed by al.
+            ah ^= ch;
+            al ^= cl;
+            idx = al;
+            // Signed divide: (d | 0x5) always has bit 0 set, so the divisor is never zero.
+            int64_t n  = ((int64_t*)&l[idx & MASK])[0];
+            int32_t d  = ((int32_t*)&l[idx & MASK])[2];
+            int64_t q = n / (d | 0x5); // NOTE(review): INT64_MIN / -1 would overflow - confirm this is acceptable
+            // Store n ^ q back; the next index is d ^ q (d is widened to 64 bits for the XOR).
+            ((int64_t*)&l[idx & MASK])[0] = n ^ q;
+            idx = d ^ q;
+        }
+#undef BYTE
+        // Finish: run the implode step and keccakf, then dispatch to extra_hashes[state[0] & 3].
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l, (__m128i*) h);
+        keccakf(h, 24);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
     }
 };
 
@@ -1204,15 +1409,15 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                           size_t size,
                           uint8_t* __restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
+                          ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1288,27 +1493,27 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
     inline static void hashPowV2(const uint8_t* __restrict__ input,
                               size_t size,
                               uint8_t* __restrict__ output,
-                              cryptonight_ctx* __restrict__ ctx)
+                              ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1396,27 +1601,27 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1508,22 +1713,22 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1614,22 +1819,22 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1720,8 +1925,164 @@ public:
         keccakf(h0, 24);
         keccakf(h1, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,   // two blobs back to back: blob 0 at input, blob 1 at input + size
+                                     size_t size,                         // length in bytes of EACH blob; bytes 35..42 of each are read below, so size must be >= 43
+                                     uint8_t* __restrict__ output,        // receives two results: hash 0 at output, hash 1 at output + 32
+                                     ScratchPad** __restrict__ scratchPad) // scratchPad[0] and scratchPad[1] are used (state and memory of each)
+    { // Computes two hashes side by side; the main loop interleaves the steps of hash 0 and hash 1.
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        // Per-hash tweak: 8 bytes at blob offset 35 (unaligned read) XOR 64-bit word 24 of that hash's state.
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        // l0/l1: scratchpad memory (declared const, but written through casts below); h0/h1: state viewed as 64-bit words.
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        // Presumably expands each state into its scratchpad (helper defined elsewhere) - TODO confirm.
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        // Running registers per hash: a = (al, ah) from state words 0^4 and 1^5; b = bx from words 2^6 (low half) and 3^7 (high half).
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        // The scratchpad cursor of each hash starts at the same value as its al.
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        // k and v64 alias the same 16 bytes (v64 is not referenced in this function); x holds the 32-bit lanes of cx.
+        union alignas(16) {
+            uint32_t k[4];
+            uint64_t v64[2];
+        };
+        alignas(16) uint32_t x[4];
+        // BYTE(p, i): byte i of object p in memory order.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) {
+            __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+            __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+            // --- hash 0: saes_table-driven mixing of ~cx0 keyed by (ah0:al0); this path is taken regardless of SOFT_AES ---
+            const __m128i& key0 = _mm_set_epi64x(ah0, al0);
+
+            _mm_store_si128((__m128i*)k, key0);
+            cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); // cmpeq(0, 0) is all-ones, so cx0 = ~cx0
+            _mm_store_si128((__m128i*)x, cx0);
+            // Lanes are chained: each x[i] ^= k[i] feeds the table lookups of the lanes computed after it.
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx0 = _mm_load_si128((__m128i*)k); // the mixed key lanes become the new cx0
+            // --- hash 1: same steps with (ah1:al1) and cx1; k and x are reused ---
+            const __m128i& key1 = _mm_set_epi64x(ah1, al1);
+
+            _mm_store_si128((__m128i*)k, key1);
+            cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); // cx1 = ~cx1
+            _mm_store_si128((__m128i*)x, cx1);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx1 = _mm_load_si128((__m128i*)k);
+            // Write b ^ c back to the 16-byte block that was just read (idx0/idx1 are still unchanged here).
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            // Patch byte 11 of that block: bits 4-5 are XORed with a value picked from 'table' by bits of the byte itself (INDEX_SHIFT is defined elsewhere).
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            // Next address comes from c (EXTRACT64 is defined elsewhere; presumably the low 64 bits - TODO confirm).
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            // b <- c for the next iteration.
+            bx0 = cx0;
+            bx1 = cx1;
+            // --- hash 0: multiply step on the block at the new address ---
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi); // 128-bit product idx0 * cl, split into lo and hi
+
+            al0 += hi;
+            ah0 += lo;
+            // The tweak is XORed into ah0 only for the store below, then removed again.
+            ah0 ^= tweak1_2_0;
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;
+            // The stored high word additionally absorbs the stored low word (memory only; ah0 itself is unaffected).
+            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
+            // Fold the words loaded before the store (cl, ch) into a, then move the cursor to al0.
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+            // Division step: n = signed 64-bit word 0, d = signed 32-bit value at byte offset 8 of the block at the new cursor.
+            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
+            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t q = n / (d | 0x5); // d | 0x5 always has bits 0 and 2 set, so the divisor is never zero
+            // NOTE(review): the divisor is -1 when d is -1, -2, -5 or -6; n == INT64_MIN would then overflow - confirm this is acceptable.
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            idx0 = d ^ q; // d is sign-extended to 64 bits before the XOR
+
+            // --- hash 1: identical multiply and division steps (hi, lo, cl, ch, n, d, q are reused) ---
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            n  = ((int64_t*)&l1[idx1 & MASK])[0];
+            d  = ((int32_t*)&l1[idx1 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+            idx1 = d ^ q;
+        }
+#undef BYTE
+        // Presumably folds each scratchpad back into its state (helper defined elsewhere) - TODO confirm.
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        // The low 2 bits of element 0 of each state select the finalizer from extra_hashes.
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
     }
 };
 
@@ -1732,18 +2093,18 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -1850,33 +2211,33 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 
   inline static void hashPowV2(const uint8_t* __restrict__ input,
                           size_t size,
                           uint8_t* __restrict__ output,
-                          cryptonight_ctx* __restrict__ ctx)
+                          ScratchPad** __restrict__ scratchPad)
   {
-      keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-      keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-      keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+      keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+      keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+      keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
       uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
       uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
       uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
 
-      const uint8_t* l0 = ctx->memory;
-      const uint8_t* l1 = ctx->memory + MEM;
-      const uint8_t* l2 = ctx->memory + 2 * MEM;
-      uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-      uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-      uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+      const uint8_t* l0 = scratchPad[0]->memory;
+      const uint8_t* l1 = scratchPad[1]->memory;
+      const uint8_t* l2 = scratchPad[2]->memory;
+      uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+      uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+      uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2000,33 +2361,33 @@ public:
       keccakf(h1, 24);
       keccakf(h2, 24);
 
-      extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-      extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-      extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+      extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+      extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+      extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
   }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2156,26 +2517,26 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2303,26 +2664,26 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
 
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2450,9 +2811,227 @@ public:
         keccakf(h1, 24);
         keccakf(h2, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,
+                                     ScratchPad** __restrict__ scratchPad)
+    { // Three-lane variant: hashes three inputs of size bytes each, stored back to back in input.
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        // Per-lane tweak: 8 bytes at input offset 35 XOR the 64-bit state word at index 24. Needs size >= 43; not checked here.
+        uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
+        uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
+        uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
+        // lN = lane scratchpad memory, hN = lane state viewed as 64-bit words. lN is declared const but is written through casts below.
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        // cn_explode_scratchpad_heavy is defined outside this hunk; presumably it initialises lN from hN (TODO confirm).
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
+        cn_explode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) h2, (__m128i*) l2);
+        // Loop registers seeded from state words: a = h[0..1] ^ h[4..5], b = h[2..3] ^ h[6..7]; idx starts equal to al.
+        uint64_t al0 = h0[0] ^h0[4];
+        uint64_t al1 = h1[0] ^h1[4];
+        uint64_t al2 = h2[0] ^h2[4];
+        uint64_t ah0 = h0[1] ^h0[5];
+        uint64_t ah1 = h1[1] ^h1[5];
+        uint64_t ah2 = h2[1] ^h2[5];
+
+        __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]);
+        __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]);
+        __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]);
+
+        uint64_t idx0 = h0[0] ^h0[4];
+        uint64_t idx1 = h1[0] ^h1[4];
+        uint64_t idx2 = h2[0] ^h2[4];
+        // k and x are 16-byte-aligned scratch buffers for the table-based round below; v64 is not referenced in this function.
+        union alignas(16) {
+            uint32_t k[4];
+            uint64_t v64[2];
+        };
+        alignas(16) uint32_t x[4];
+        // BYTE(p, i) reads byte i of object p; it is #undef'd right after the loop.
+#define BYTE(p, i) ((unsigned char*)&p)[i]
+        for (size_t i = 0; i < ITERATIONS; i++) { // ITERATIONS, MASK and INDEX_SHIFT are defined outside this hunk
+            __m128i cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]);
+            __m128i cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]);
+            __m128i cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]);
+            // --- lane 0: table-based round, key = (ah0:al0), data = ~cx0 ---
+            const __m128i& key0 = _mm_set_epi64x(ah0, al0);
+
+            _mm_store_si128((__m128i*)k, key0);
+            cx0 = _mm_xor_si128(cx0, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); // cmpeq(0, 0) is all-ones, so cx0 = ~cx0
+            _mm_store_si128((__m128i*)x, cx0);
+            // Each k word is XORed into x before the next k word is computed, so the four words are chained.
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx0 = _mm_load_si128((__m128i*)k); // round result for lane 0
+            // --- lane 1: same round, key = (ah1:al1), data = ~cx1 ---
+            const __m128i& key1 = _mm_set_epi64x(ah1, al1);
+
+            _mm_store_si128((__m128i*)k, key1);
+            cx1 = _mm_xor_si128(cx1, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*)x, cx1);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx1 = _mm_load_si128((__m128i*)k); // round result for lane 1
+            // --- lane 2: same round, key = (ah2:al2), data = ~cx2 ---
+            const __m128i& key2 = _mm_set_epi64x(ah2, al2);
+
+            _mm_store_si128((__m128i*)k, key2);
+            cx2 = _mm_xor_si128(cx2, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128()));
+            _mm_store_si128((__m128i*)x, cx2);
+
+            k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)];
+            x[0] ^= k[0];
+            k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)];
+            x[1] ^= k[1];
+            k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)];
+            x[2] ^= k[2];
+            k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)];
+
+            cx2 = _mm_load_si128((__m128i*)k); // round result for lane 2
+            // Write b ^ c back to the slot that was just read (idx is still the old index here).
+            _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0));
+            _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1));
+            _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2));
+            // Byte-11 patch: index is an even shift in 0..14 into the packed constant; only bits 4-5 of the shifted value are XORed in.
+            static const uint32_t table = 0x75310;
+            uint8_t tmp = reinterpret_cast<const uint8_t*>(&l0[idx0 & MASK])[11];
+            uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l1[idx1 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            tmp = reinterpret_cast<const uint8_t*>(&l2[idx2 & MASK])[11];
+            index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1;
+            ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30);
+            // Next index is taken from c via EXTRACT64 (macro defined outside this hunk); c then becomes the new b.
+            idx0 = EXTRACT64(cx0);
+            idx1 = EXTRACT64(cx1);
+            idx2 = EXTRACT64(cx2);
+
+            bx0 = cx0;
+            bx1 = cx1;
+            bx2 = cx2;
+            // --- lane 0: multiply step on the slot at the new index ---
+            uint64_t hi, lo, cl, ch;
+            cl = ((uint64_t*) &l0[idx0 & MASK])[0];
+            ch = ((uint64_t*) &l0[idx0 & MASK])[1];
+            lo = __umul128(idx0, cl, &hi); // product of idx0 and cl, split into lo (return value) and hi (out-param)
+
+            al0 += hi;
+            ah0 += lo;
+            // Store (al0, ah0 ^ tweak); the second XOR restores ah0 for the steps that follow.
+            ah0 ^= tweak1_2_0;
+            ((uint64_t*) &l0[idx0 & MASK])[0] = al0;
+            ((uint64_t*) &l0[idx0 & MASK])[1] = ah0;
+            ah0 ^= tweak1_2_0;
+            // Extra fold: the slot's second 64-bit word is XORed with its first word.
+            ((uint64_t*)&l0[idx0 & MASK])[1] ^= ((uint64_t*)&l0[idx0 & MASK])[0];
+
+            ah0 ^= ch;
+            al0 ^= cl;
+            idx0 = al0;
+            // Division step: n is the slot's first 64-bit word, d the 32-bit word at byte offset 8.
+            int64_t n  = ((int64_t*)&l0[idx0 & MASK])[0];
+            int32_t d  = ((int32_t*)&l0[idx0 & MASK])[2];
+            int64_t q = n / (d | 0x5); // divisor always has bit 0 set, so never zero. NOTE(review): INT64_MIN / -1 is not guarded
+
+            ((int64_t*)&l0[idx0 & MASK])[0] = n ^ q;
+            idx0 = d ^ q; // d is converted to 64-bit signed (sign-extended) before the XOR
+
+            // --- lane 1: same multiply / store / fold / division steps ---
+            cl = ((uint64_t*) &l1[idx1 & MASK])[0];
+            ch = ((uint64_t*) &l1[idx1 & MASK])[1];
+            lo = __umul128(idx1, cl, &hi);
+
+            al1 += hi;
+            ah1 += lo;
+
+            ah1 ^= tweak1_2_1;
+            ((uint64_t*) &l1[idx1 & MASK])[0] = al1;
+            ((uint64_t*) &l1[idx1 & MASK])[1] = ah1;
+            ah1 ^= tweak1_2_1;
+
+            ((uint64_t*)&l1[idx1 & MASK])[1] ^= ((uint64_t*)&l1[idx1 & MASK])[0];
+
+            ah1 ^= ch;
+            al1 ^= cl;
+            idx1 = al1;
+
+            n  = ((int64_t*)&l1[idx1 & MASK])[0];
+            d  = ((int32_t*)&l1[idx1 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l1[idx1 & MASK])[0] = n ^ q;
+            idx1 = d ^ q;
+
+            // --- lane 2: same multiply / store / fold / division steps ---
+            cl = ((uint64_t*) &l2[idx2 & MASK])[0];
+            ch = ((uint64_t*) &l2[idx2 & MASK])[1];
+            lo = __umul128(idx2, cl, &hi);
+
+            al2 += hi;
+            ah2 += lo;
+
+            ah2 ^= tweak1_2_2;
+            ((uint64_t*) &l2[idx2 & MASK])[0] = al2;
+            ((uint64_t*) &l2[idx2 & MASK])[1] = ah2;
+            ah2 ^= tweak1_2_2;
+
+            ((uint64_t*)&l2[idx2 & MASK])[1] ^= ((uint64_t*)&l2[idx2 & MASK])[0];
+
+            ah2 ^= ch;
+            al2 ^= cl;
+            idx2 = al2;
+
+            n  = ((int64_t*)&l2[idx2 & MASK])[0];
+            d  = ((int32_t*)&l2[idx2 & MASK])[2];
+            q = n / (d | 0x5);
+
+            ((int64_t*)&l2[idx2 & MASK])[0] = n ^ q;
+            idx2 = d ^ q;
+        }
+#undef BYTE
+        // cn_implode_scratchpad_heavy is defined outside this hunk; presumably it folds lN back into hN (TODO confirm).
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l0, (__m128i*) h0);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l1, (__m128i*) h1);
+        cn_implode_scratchpad_heavy<MEM, SOFT_AES>((__m128i*) l2, (__m128i*) h2);
+
+        keccakf(h0, 24);
+        keccakf(h1, 24);
+        keccakf(h2, 24);
+        // The final hash function is selected by the low 2 bits of state[0]; results are written at output + 0, + 32 and + 64.
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
     }
 };
 
@@ -2463,21 +3042,21 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2613,39 +3192,39 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
     }
 
   inline static void hashPowV2(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
   {
-      keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-      keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-      keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-      keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
+      keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+      keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+      keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+      keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
 
       uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
       uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
       uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
       uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 3 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
 
-      const uint8_t* l0 = ctx->memory;
-      const uint8_t* l1 = ctx->memory + MEM;
-      const uint8_t* l2 = ctx->memory + 2 * MEM;
-      const uint8_t* l3 = ctx->memory + 3 * MEM;
-      uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-      uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-      uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-      uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+      const uint8_t* l0 = scratchPad[0]->memory;
+      const uint8_t* l1 = scratchPad[1]->memory;
+      const uint8_t* l2 = scratchPad[2]->memory;
+      const uint8_t* l3 = scratchPad[3]->memory;
+      uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+      uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+      uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+      uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
 
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -2803,39 +3382,39 @@ public:
       keccakf(h2, 24);
       keccakf(h3, 24);
 
-      extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-      extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-      extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-      extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+      extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+      extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+      extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+      extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
   }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
         uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 3 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3001,24 +3580,32 @@ public:
         keccakf(h2, 24);
         keccakf(h3, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad) // stub: empty body, nothing is written to output
     {
         // not supported
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad) // stub: empty body, nothing is written to output
+    {
+        // not supported
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,
+                                     ScratchPad** __restrict__ scratchPad) // stub: empty body, nothing is written to output
     {
         // not supported
     }
@@ -3031,24 +3618,24 @@ public:
     inline static void hash(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
-        keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
+        keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200);
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        const uint8_t* l4 = ctx->memory + 4 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-        uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        const uint8_t* l4 = scratchPad[4]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3212,46 +3799,46 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+        extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
     }
 
   inline static void hashPowV2(const uint8_t* __restrict__ input,
                             size_t size,
                             uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                            ScratchPad** __restrict__ scratchPad)
   {
-      keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-      keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-      keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-      keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
-      keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200);
+      keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+      keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+      keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+      keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
+      keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200);
 
       uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
       uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
       uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
       uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 3 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
       uint64_t tweak1_2_4 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 4 * size) ^
-                             *(reinterpret_cast<const uint64_t*>(ctx->state[4]) + 24));
+                             *(reinterpret_cast<const uint64_t*>(scratchPad[4]->state) + 24));
 
 
-      const uint8_t* l0 = ctx->memory;
-      const uint8_t* l1 = ctx->memory + MEM;
-      const uint8_t* l2 = ctx->memory + 2 * MEM;
-      const uint8_t* l3 = ctx->memory + 3 * MEM;
-      const uint8_t* l4 = ctx->memory + 4 * MEM;
-      uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-      uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-      uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-      uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-      uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+      const uint8_t* l0 = scratchPad[0]->memory;
+      const uint8_t* l1 = scratchPad[1]->memory;
+      const uint8_t* l2 = scratchPad[2]->memory;
+      const uint8_t* l3 = scratchPad[3]->memory;
+      const uint8_t* l4 = scratchPad[4]->memory;
+      uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+      uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+      uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+      uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+      uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
 
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
       cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3442,46 +4029,46 @@ public:
       keccakf(h3, 24);
       keccakf(h4, 24);
 
-      extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-      extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-      extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-      extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-      extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+      extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+      extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+      extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+      extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+      extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
   }
 
-    inline static void hashLiteIpbc(const uint8_t* __restrict__ input,
+    inline static void hashLiteTube(const uint8_t* __restrict__ input,
                                  size_t size,
                                  uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                 ScratchPad** __restrict__ scratchPad)
     {
-        keccak((const uint8_t*) input, (int) size, ctx->state[0], 200);
-        keccak((const uint8_t*) input + size, (int) size, ctx->state[1], 200);
-        keccak((const uint8_t*) input + 2 * size, (int) size, ctx->state[2], 200);
-        keccak((const uint8_t*) input + 3 * size, (int) size, ctx->state[3], 200);
-        keccak((const uint8_t*) input + 4 * size, (int) size, ctx->state[4], 200);
+        keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200);
+        keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200);
+        keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200);
+        keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200);
+        keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200);
 
         uint64_t tweak1_2_0 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[0]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[0]->state) + 24));
         uint64_t tweak1_2_1 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[1]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[1]->state) + 24));
         uint64_t tweak1_2_2 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 2 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[2]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[2]->state) + 24));
         uint64_t tweak1_2_3 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 3 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[3]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[3]->state) + 24));
         uint64_t tweak1_2_4 = (*reinterpret_cast<const uint64_t*>(reinterpret_cast<const uint8_t*>(input) + 35 + 4 * size) ^
-                               *(reinterpret_cast<const uint64_t*>(ctx->state[4]) + 24));
+                               *(reinterpret_cast<const uint64_t*>(scratchPad[4]->state) + 24));
 
 
-        const uint8_t* l0 = ctx->memory;
-        const uint8_t* l1 = ctx->memory + MEM;
-        const uint8_t* l2 = ctx->memory + 2 * MEM;
-        const uint8_t* l3 = ctx->memory + 3 * MEM;
-        const uint8_t* l4 = ctx->memory + 4 * MEM;
-        uint64_t* h0 = reinterpret_cast<uint64_t*>(ctx->state[0]);
-        uint64_t* h1 = reinterpret_cast<uint64_t*>(ctx->state[1]);
-        uint64_t* h2 = reinterpret_cast<uint64_t*>(ctx->state[2]);
-        uint64_t* h3 = reinterpret_cast<uint64_t*>(ctx->state[3]);
-        uint64_t* h4 = reinterpret_cast<uint64_t*>(ctx->state[4]);
+        const uint8_t* l0 = scratchPad[0]->memory;
+        const uint8_t* l1 = scratchPad[1]->memory;
+        const uint8_t* l2 = scratchPad[2]->memory;
+        const uint8_t* l3 = scratchPad[3]->memory;
+        const uint8_t* l4 = scratchPad[4]->memory;
+        uint64_t* h0 = reinterpret_cast<uint64_t*>(scratchPad[0]->state);
+        uint64_t* h1 = reinterpret_cast<uint64_t*>(scratchPad[1]->state);
+        uint64_t* h2 = reinterpret_cast<uint64_t*>(scratchPad[2]->state);
+        uint64_t* h3 = reinterpret_cast<uint64_t*>(scratchPad[3]->state);
+        uint64_t* h4 = reinterpret_cast<uint64_t*>(scratchPad[4]->state);
 
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h0, (__m128i*) l0);
         cn_explode_scratchpad<MEM, SOFT_AES>((__m128i*) h1, (__m128i*) l1);
@@ -3682,27 +4269,36 @@ public:
         keccakf(h3, 24);
         keccakf(h4, 24);
 
-        extra_hashes[ctx->state[0][0] & 3](ctx->state[0], 200, output);
-        extra_hashes[ctx->state[1][0] & 3](ctx->state[1], 200, output + 32);
-        extra_hashes[ctx->state[2][0] & 3](ctx->state[2], 200, output + 64);
-        extra_hashes[ctx->state[3][0] & 3](ctx->state[3], 200, output + 96);
-        extra_hashes[ctx->state[4][0] & 3](ctx->state[4], 200, output + 128);
+        extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output);
+        extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32);
+        extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64);
+        extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96);
+        extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128);
     }
 
     inline static void hashHeavy(const uint8_t* __restrict__ input,
-                            size_t size,
-                            uint8_t* __restrict__ output,
-                            cryptonight_ctx* __restrict__ ctx)
+                                 size_t size,
+                                 uint8_t* __restrict__ output,
+                                 ScratchPad** __restrict__ scratchPad)
     {
         // not supported
     }
 
     inline static void hashHeavyHaven(const uint8_t* __restrict__ input,
-                                 size_t size,
-                                 uint8_t* __restrict__ output,
-                                 cryptonight_ctx* __restrict__ ctx)
+                                      size_t size,
+                                      uint8_t* __restrict__ output,
+                                      ScratchPad** __restrict__ scratchPad)
+    {
+        // not supported
+    }
+
+    inline static void hashHeavyTube(const uint8_t* __restrict__ input,
+                                     size_t size,
+                                     uint8_t* __restrict__ output,
+                                     ScratchPad** __restrict__ scratchPad)
     {
         // not supported
     }
 };
+
 #endif /* __CRYPTONIGHT_X86_H__ */
diff --git a/src/crypto/SSE2NEON.h b/src/crypto/SSE2NEON.h
index 6a00448d..0b8413fc 100644
--- a/src/crypto/SSE2NEON.h
+++ b/src/crypto/SSE2NEON.h
@@ -1189,6 +1189,12 @@ FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
 	return vreinterpretq_m128i_u32(vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
 }
 
+// Compares the 4 32-bit integers in a with the 4 32-bit integers in b for equality; each result lane is all ones when the pair is equal and zero otherwise.
+FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
+{
+	return vreinterpretq_m128i_u32(vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
+}
+
 // Compares the four 32-bit floats in a and b to check if any values are NaN. Ordered compare between each value returns true for "orderable" and false for "not orderable" (NaN). https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx
 // see also:
 // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
diff --git a/src/crypto/soft_aes.h b/src/crypto/soft_aes.h
index 0703f98d..20c67c09 100644
--- a/src/crypto/soft_aes.h
+++ b/src/crypto/soft_aes.h
@@ -105,12 +105,29 @@ static inline __m128i soft_aesenc(const uint32_t* in, __m128i key)
     return _mm_xor_si128(out, key);
 }
 
+static inline __m128i soft_aesenc(__m128i in, __m128i key) // register-input overload: table-driven round on `in`, then XOR with `key`; presumably the software stand-in for _mm_aesenc_si128 - confirm against saes_table
+{
+    uint32_t x0, x1, x2, x3; // the four 32-bit lanes of `in`, lane 0 first
+    x0 = _mm_cvtsi128_si32(in); // lane 0 is read directly
+    x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0x55)); // 0x55 copies lane 1 into every lane, so lane 0 now holds it
+    x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xAA)); // 0xAA copies lane 2 into every lane
+    x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(in, 0xFF)); // 0xFF copies lane 3 into every lane
+
+    __m128i out = _mm_set_epi32( // arguments run from lane 3 down to lane 0; each output word mixes one byte from every input word
+            (saes_table[0][x3 & 0xff] ^ saes_table[1][(x0 >> 8) & 0xff] ^ saes_table[2][(x1 >> 16) & 0xff] ^ saes_table[3][x2 >> 24]),
+            (saes_table[0][x2 & 0xff] ^ saes_table[1][(x3 >> 8) & 0xff] ^ saes_table[2][(x0 >> 16) & 0xff] ^ saes_table[3][x1 >> 24]),
+            (saes_table[0][x1 & 0xff] ^ saes_table[1][(x2 >> 8) & 0xff] ^ saes_table[2][(x3 >> 16) & 0xff] ^ saes_table[3][x0 >> 24]),
+            (saes_table[0][x0 & 0xff] ^ saes_table[1][(x1 >> 8) & 0xff] ^ saes_table[2][(x2 >> 16) & 0xff] ^ saes_table[3][x3 >> 24]));
+
+    return _mm_xor_si128(out, key); // the key is applied last, after the table lookups
+}
+
 static inline uint32_t sub_word(uint32_t key)
 {
-    return (saes_sbox[key >> 24 ] << 24)   | 
-        (saes_sbox[(key >> 16) & 0xff] << 16 ) | 
-        (saes_sbox[(key >> 8)  & 0xff] << 8  ) | 
-         saes_sbox[key & 0xff];
+    return (saes_sbox[key >> 24 ] << 24)   |
+           (saes_sbox[(key >> 16) & 0xff] << 16 ) |
+           (saes_sbox[(key >> 8)  & 0xff] << 8  ) |
+           saes_sbox[key & 0xff];
 }
 
 #if defined(__clang__) || defined(XMRIG_ARM)
diff --git a/src/log/Log.h b/src/log/Log.h
index ea0fe1b3..53564394 100644
--- a/src/log/Log.h
+++ b/src/log/Log.h
@@ -73,6 +73,19 @@ private:
 };
 
 
+#define RED_BOLD(x)     "\x1B[1;31m" x "\x1B[0m" // colour helpers: x must be a string literal; it is joined with an ANSI escape prefix and the ESC[0m reset by literal concatenation
+#define RED(x)          "\x1B[0;31m" x "\x1B[0m"
+#define GREEN_BOLD(x)   "\x1B[1;32m" x "\x1B[0m"
+#define GREEN(x)        "\x1B[0;32m" x "\x1B[0m"
+#define MAGENTA_BOLD(x) "\x1B[1;35m" x "\x1B[0m"
+#define MAGENTA(x)      "\x1B[0;35m" x "\x1B[0m"
+#define CYAN_BOLD(x)    "\x1B[1;36m" x "\x1B[0m"
+#define CYAN(x)         "\x1B[0;36m" x "\x1B[0m"
+#define WHITE_BOLD(x)   "\x1B[1;37m" x "\x1B[0m"
+#define WHITE(x)        "\x1B[0;37m" x "\x1B[0m"
+#define YELLOW_BOLD(x)  "\x1B[1;33m" x "\x1B[0m"
+#define YELLOW(x)       "\x1B[0;33m" x "\x1B[0m"
+
 #define LOG_ERR(x, ...)    Log::i()->message(Log::ERR,     x, ##__VA_ARGS__)
 #define LOG_WARN(x, ...)   Log::i()->message(Log::WARNING, x, ##__VA_ARGS__)
 #define LOG_NOTICE(x, ...) Log::i()->message(Log::NOTICE,  x, ##__VA_ARGS__)
diff --git a/src/version.h b/src/version.h
index 33cfdd7b..776d2767 100644
--- a/src/version.h
+++ b/src/version.h
@@ -36,14 +36,14 @@
 #define APP_DESC      "XMRigCC CPU miner"
 #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id"
 #endif
-#define APP_VERSION   "1.6.4 (based on XMRig)"
+#define APP_VERSION   "1.6.5_beta1 (based on XMRig)"
 #define APP_DOMAIN    ""
 #define APP_SITE      "https://github.com/Bendr0id/xmrigCC"
 #define APP_KIND      "cpu"
 
 #define APP_VER_MAJOR  1
 #define APP_VER_MINOR  6
-#define APP_VER_BUILD  4
+#define APP_VER_BUILD  5
 #define APP_VER_REV    0
 
 #ifndef NDEBUG
diff --git a/src/workers/Handle.cpp b/src/workers/Handle.cpp
index c461cee7..89906a36 100644
--- a/src/workers/Handle.cpp
+++ b/src/workers/Handle.cpp
@@ -25,7 +25,7 @@
 #include "workers/Handle.h"
 
 
-Handle::Handle(int threadId, int threads, int64_t affinity, int priority) :
+Handle::Handle(size_t threadId, size_t threads, int64_t affinity, int priority) :
     m_priority(priority),
     m_threadId(threadId),
     m_threads(threads),
diff --git a/src/workers/Handle.h b/src/workers/Handle.h
index 9faae0d0..21506faf 100644
--- a/src/workers/Handle.h
+++ b/src/workers/Handle.h
@@ -35,21 +35,21 @@ class IWorker;
 class Handle
 {
 public:
-    Handle(int threadId, int threads, int64_t affinity, int priority);
+    Handle(size_t threadId, size_t threads, int64_t affinity, int priority);
     void join();
     void start(void (*callback) (void *));
 
     inline int priority() const            { return m_priority; }
-    inline int threadId() const            { return m_threadId; }
-    inline int threads() const             { return m_threads; }
+    inline size_t threadId() const         { return m_threadId; }
+    inline size_t threads() const          { return m_threads; }
     inline int64_t affinity() const        { return m_affinity; }
     inline IWorker *worker() const         { return m_worker; }
     inline void setWorker(IWorker *worker) { m_worker = worker; }
 
 private:
     int m_priority;
-    int m_threadId;
-    int m_threads;
+    size_t m_threadId;
+    size_t m_threads;
     int64_t m_affinity;
     IWorker *m_worker;
     uv_thread_t m_thread;
diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp
index d1d16ad6..e599b87f 100644
--- a/src/workers/MultiWorker.cpp
+++ b/src/workers/MultiWorker.cpp
@@ -24,6 +24,7 @@
 
 
 #include <thread>
+#include <log/Log.h>
 
 
 #include "crypto/CryptoNight.h"
@@ -35,7 +36,7 @@
 class MultiWorker : public Worker
 {
 public:
-    explicit MultiWorker(Handle *handle, size_t hashMultiplier);
+    explicit MultiWorker(Handle *handle, size_t hashFactor);
     ~MultiWorker();
 
     void start() override;
@@ -50,7 +51,10 @@ private:
     uint8_t* m_hash;
     State *m_state;
     State *m_pausedState;
-    size_t m_hashMultiplier;
+    size_t m_hashFactor;
+
+    ScratchPadMem scratchPadMem;
+    ScratchPad* scratchPads[MAX_NUM_HASH_BLOCKS];
 };
 
 class MultiWorker::State
@@ -77,13 +81,14 @@ public:
 };
 
 
-MultiWorker::MultiWorker(Handle *handle, size_t hashMultiplier)
+MultiWorker::MultiWorker(Handle *handle, size_t hashFactor)
     : Worker(handle),
-      m_hash(new uint8_t[32 * hashMultiplier]),
-      m_state(new MultiWorker::State(hashMultiplier)),
-      m_pausedState(new MultiWorker::State(hashMultiplier)),
-      m_hashMultiplier(hashMultiplier)
+      m_hash(new uint8_t[32 * hashFactor]),
+      m_state(new MultiWorker::State(hashFactor)),
+      m_pausedState(new MultiWorker::State(hashFactor)),
+      m_hashFactor(hashFactor)
 {
+    scratchPadMem = Mem::create(scratchPads, m_id);
 }
 
 MultiWorker::~MultiWorker()
@@ -91,10 +96,25 @@ MultiWorker::~MultiWorker()
     delete[] m_hash;
     delete m_state;
     delete m_pausedState;
+
+    Mem::release(scratchPads, scratchPadMem, m_id);
 }
 
 void MultiWorker::start()
 {
+    const size_t memory  = scratchPadMem.realSize / 1048576; // scratchpad size in whole MiB; any remainder is truncated
+
+    if (Options::i()->colors()) { // coloured start-up line; the %s argument injects the colour for the huge-page counters
+        LOG_INFO(WHITE_BOLD("Starting thread ") GREEN_BOLD("%zu/%zu") " affined to core: " GREEN_BOLD("#%d") " -> huge pages:" GREEN_BOLD(" %s%zu/%zu") " scratchpad: " CYAN_BOLD("%zu.0 MB"),
+            static_cast<size_t>(m_id + 1), Options::i()->threads(), m_affinedCpu, // m_id is an int: cast so the argument really is the size_t that %zu reads
+                 (scratchPadMem.hugePages == scratchPadMem.pages ? "\x1B[1;32m" : (scratchPadMem.hugePages == 0 ? "\x1B[1;31m" : "\x1B[1;33m")), // bold green = all pages huge, bold red = none, bold yellow = some
+                  scratchPadMem.hugePages, scratchPadMem.pages, memory);
+    }
+    else {
+        LOG_INFO("Starting thread %zu/%zu affined to core: #%d -> huge pages: %zu/%zu scratchpad: %zu.0 MB",
+                         static_cast<size_t>(m_id + 1), Options::i()->threads(), m_affinedCpu, scratchPadMem.hugePages, scratchPadMem.pages, memory); // same cast as the coloured branch
+    }
+
     while (Workers::sequence() > 0) {
         if (Workers::isPaused()) {
             do {
@@ -114,15 +134,15 @@ void MultiWorker::start()
                 storeStats();
             }
 
-            m_count += m_hashMultiplier;
+            m_count += m_hashFactor;
 
-            for (size_t i=0; i < m_hashMultiplier; ++i) {
+            for (size_t i=0; i < m_hashFactor; ++i) {
                 *Job::nonce(m_state->blob + i * m_state->job.size()) = ++m_state->nonces[i];
             }
 
-            CryptoNight::hash(m_hashMultiplier, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, m_ctx);
+            CryptoNight::hash(m_hashFactor, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads);
 
-            for (size_t i=0; i < m_hashMultiplier; ++i) {
+            for (size_t i=0; i < m_hashFactor; ++i) {
                 if (*reinterpret_cast<uint64_t *>(m_hash + 24 + i * 32) < m_state->job.target()) {
                     Workers::submit(JobResult(m_state->job.poolId(), m_state->job.id(), m_state->nonces[i], m_hash + i * 32,
                                               m_state->job.diff()), m_id);
@@ -162,7 +182,7 @@ void MultiWorker::consumeJob()
 
     m_state->job = std::move(job);
 
-    for (size_t i=0; i < m_hashMultiplier; ++i) {
+    for (size_t i=0; i < m_hashFactor; ++i) {
         memcpy(m_state->blob + i * m_state->job.size(), m_state->job.blob(), m_state->job.size());
         if (m_state->job.isNicehash()) {
             m_state->nonces[i] = (*Job::nonce(m_state->blob + i * m_state->job.size()) & 0xff000000U) +
@@ -183,6 +203,6 @@ void MultiWorker::save(const Job &job)
     }
 }
 
-Worker* createMultiWorker(size_t numHashes, Handle *handle) {
-    return new MultiWorker(handle, numHashes);
+Worker* createMultiWorker(Handle *handle, size_t hashFactor) {
+    return new MultiWorker(handle, hashFactor);
 }
\ No newline at end of file
diff --git a/src/workers/MultiWorker.h b/src/workers/MultiWorker.h
index 33f5a062..14b3d13d 100644
--- a/src/workers/MultiWorker.h
+++ b/src/workers/MultiWorker.h
@@ -33,7 +33,7 @@
 
 class Handle;
 
-Worker* createMultiWorker(size_t numHashes, Handle *handle);
+Worker* createMultiWorker(Handle *handle, size_t hashFactor);
 
 
 #endif /* __SINGLEWORKER_H__ */
diff --git a/src/workers/Worker.cpp b/src/workers/Worker.cpp
index 02646ced..b5a84f8b 100644
--- a/src/workers/Worker.cpp
+++ b/src/workers/Worker.cpp
@@ -39,12 +39,11 @@ Worker::Worker(Handle *handle) :
     m_count(0),
     m_sequence(0)
 {
-    if (Cpu::threads() > 1 && handle->affinity() != -1L) {
-        Cpu::setAffinity(m_id, handle->affinity());
+    if (m_threads > 1 && m_threads <= Cpu::threads()) {
+        m_affinedCpu = Cpu::setThreadAffinity(m_id, handle->affinity());
     }
 
     Platform::setThreadPriority(handle->priority());
-    m_ctx = Mem::create(m_id);
 }
 
 
diff --git a/src/workers/Worker.h b/src/workers/Worker.h
index 11c4a198..9abf2ec3 100644
--- a/src/workers/Worker.h
+++ b/src/workers/Worker.h
@@ -32,7 +32,7 @@
 #include "interfaces/IWorker.h"
 
 
-struct cryptonight_ctx;
+struct ScratchPad;
 class Handle;
 
 
@@ -48,9 +48,9 @@ public:
 protected:
     void storeStats();
 
-    cryptonight_ctx *m_ctx;
     int m_id;
-    int m_threads;
+    int m_affinedCpu = -1; // -1 = not pinned; keeps this value when the constructor skips Cpu::setThreadAffinity(), so logging it never reads an indeterminate int
+    size_t m_threads;
     std::atomic<uint64_t> m_hashCount;
     std::atomic<uint64_t> m_timestamp;
     uint64_t m_count;
diff --git a/src/workers/Workers.cpp b/src/workers/Workers.cpp
index e225bcc5..ef786842 100644
--- a/src/workers/Workers.cpp
+++ b/src/workers/Workers.cpp
@@ -100,9 +100,8 @@ void Workers::setJob(const Job &job)
 }
 
 
-void Workers::start(int64_t affinity, int priority)
+void Workers::start(size_t threads, int64_t affinityMask, int priority)
 {
-    const int threads = Mem::threads();
     m_hashrate = new Hashrate(threads);
 
     uv_mutex_init(&m_mutex);
@@ -115,8 +114,8 @@ void Workers::start(int64_t affinity, int priority)
     uv_timer_init(uv_default_loop(), &m_timer);
     uv_timer_start(&m_timer, Workers::onTick, 500, 500);
 
-    for (int i = 0; i < threads; ++i) {
-        auto handle = new Handle(i, threads, affinity, priority);
+    for (size_t i = 0; i < threads; ++i) {
+        auto handle = new Handle(i, threads, affinityMask, priority);
         m_workers.push_back(handle);
         handle->start(Workers::onReady);
     }
@@ -151,7 +150,7 @@ void Workers::submit(const JobResult &result, int threadId)
 void Workers::onReady(void *arg)
 {
     auto handle = static_cast<Handle*>(arg);
-    handle->setWorker(createMultiWorker(Mem::getThreadHashFactor(handle->threadId()), handle));
+    handle->setWorker(createMultiWorker(handle, Mem::getThreadHashFactor(handle->threadId())));
     handle->worker()->start();
 }
 
diff --git a/src/workers/Workers.h b/src/workers/Workers.h
index 22a2b376..c21f5564 100644
--- a/src/workers/Workers.h
+++ b/src/workers/Workers.h
@@ -46,7 +46,7 @@ public:
     static void printHashrate(bool detail);
     static void setEnabled(bool enabled);
     static void setJob(const Job &job);
-    static void start(int64_t affinity, int priority);
+    static void start(size_t threads, int64_t affinityMask, int priority);
     static void stop();
     static void submit(const JobResult &result, int threadId);
 
diff --git a/test/cryptonight/cryptonight.c b/test/cryptonight/cryptonight.c
index bcc0db30..dd0ec615 100644
--- a/test/cryptonight/cryptonight.c
+++ b/test/cryptonight/cryptonight.c
@@ -26,10 +26,10 @@ const static char input2[] = "This is a test";
 const static char input3[] = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Vivamus pellentesque metus.";
 
 
-void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx);
 
 
 static char hash[64];
@@ -55,21 +55,21 @@ static char *bin2hex(const unsigned char *p, size_t len)
 
 
 static void * create_ctx(int ratio) {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+    struct ScratchPad *ctx = (struct ScratchPad*) _mm_malloc(sizeof(struct ScratchPad), 16);
     ctx->memory = (uint8_t *) _mm_malloc(MEMORY * ratio, 16);
 
     return ctx;
 }
 
 
-static void free_ctx(struct cryptonight_ctx *ctx) {
+static void free_ctx(struct ScratchPad *ctx) {
     _mm_free(ctx->memory);
     _mm_free(ctx);
 }
 
 
 void test_cryptonight_av1_should_CalcHash(void) {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1);
 
     cryptonight_av1_aesni(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32));
@@ -86,7 +86,7 @@ void test_cryptonight_av1_should_CalcHash(void) {
 
 void test_cryptonight_av2_should_CalcHash(void)
 {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2);
 
     cryptonight_av2_aesni_double(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64));
@@ -97,7 +97,7 @@ void test_cryptonight_av2_should_CalcHash(void)
 
 void test_cryptonight_av3_should_CalcHash(void)
 {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1);
 
     cryptonight_av3_softaes(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32));
@@ -114,7 +114,7 @@ void test_cryptonight_av3_should_CalcHash(void)
 
 void test_cryptonight_av4_should_CalcHash(void)
 {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2);
 
     cryptonight_av4_softaes_double(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64));
diff --git a/test/cryptonight_lite/cryptonight_lite.c b/test/cryptonight_lite/cryptonight_lite.c
index a6d5b554..61319da2 100644
--- a/test/cryptonight_lite/cryptonight_lite.c
+++ b/test/cryptonight_lite/cryptonight_lite.c
@@ -24,15 +24,15 @@ const static char input1[152] = {
 };
 
 
-void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)          {}
-void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)   {}
-void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx)        {}
-void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx) {}
+void cryptonight_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx)          {}
+void cryptonight_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx)   {}
+void cryptonight_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx)        {}
+void cryptonight_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx) {}
 
-void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
-void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct cryptonight_ctx* ctx);
+void cryptonight_lite_av1_aesni(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_lite_av2_aesni_double(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_lite_av3_softaes(const void* input, size_t size, void* output, struct ScratchPad* ctx);
+void cryptonight_lite_av4_softaes_double(const void* input, size_t size, void* output, struct ScratchPad* ctx);
 
 
 static char hash[64];
@@ -56,21 +56,21 @@ static char *bin2hex(const unsigned char *p, size_t len)
 
 
 static void * create_ctx(int ratio) {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) _mm_malloc(sizeof(struct cryptonight_ctx), 16);
+    struct ScratchPad *ctx = (struct ScratchPad*) _mm_malloc(sizeof(struct ScratchPad), 16);
     ctx->memory = (uint8_t *) _mm_malloc(MEMORY_LITE * ratio, 16);
 
     return ctx;
 }
 
 
-static void free_ctx(struct cryptonight_ctx *ctx) {
+static void free_ctx(struct ScratchPad *ctx) {
     _mm_free(ctx->memory);
     _mm_free(ctx);
 }
 
 
 void test_cryptonight_lite_av1_should_CalcHash(void) {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1);
 
     cryptonight_lite_av1_aesni(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32));
@@ -81,7 +81,7 @@ void test_cryptonight_lite_av1_should_CalcHash(void) {
 
 void test_cryptonight_lite_av2_should_CalcHash(void)
 {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2);
 
     cryptonight_lite_av2_aesni_double(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64));
@@ -91,7 +91,7 @@ void test_cryptonight_lite_av2_should_CalcHash(void)
 
 
 void test_cryptonight_lite_av3_should_CalcHash(void) {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(1);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(1);
 
     cryptonight_lite_av3_softaes(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1, bin2hex(hash, 32));
@@ -102,7 +102,7 @@ void test_cryptonight_lite_av3_should_CalcHash(void) {
 
 void test_cryptonight_lite_av4_should_CalcHash(void)
 {
-    struct cryptonight_ctx *ctx = (struct cryptonight_ctx*) create_ctx(2);
+    struct ScratchPad *ctx = (struct ScratchPad*) create_ctx(2);
 
     cryptonight_lite_av4_softaes_double(input1, 76, &hash, ctx);
     TEST_ASSERT_EQUAL_STRING(RESULT1_DOUBLE, bin2hex(hash, 64));