diff --git a/CMakeLists.txt b/CMakeLists.txt
index 25828c3e..c4e30ea1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -10,6 +10,7 @@ option(WITH_HTTPD "HTTP REST API" OFF)
 option(WITH_CC_CLIENT "CC Client" ON)
 option(WITH_CC_SERVER "CC Server" ON)
 option(WITH_TLS "TLS support" ON)
+option(WITH_ASM "ASM optimizations" ON)
 option(BUILD_STATIC "Build static binary" OFF)
 set(Boost_USE_STATIC_RUNTIME ON)
 set(Boost_USE_STATIC_LIBS ON)
@@ -128,7 +129,7 @@ find_package(UV REQUIRED)
 if (WIN32)
     add_definitions(-DBOOST_ALL_NO_LIB)
-endif()
+endif(WIN32)
 
 find_package(Boost 1.63.0 COMPONENTS system REQUIRED)
 
@@ -144,10 +145,10 @@ if (WITH_TLS)
         set(SOURCES_SSL_TLS src/net/BoostTlsConnection.cpp)
     else()
         message(FATAL_ERROR "OpenSSL NOT found: use `-DWITH_TLS=OFF` to build without TLS support")
-    endif()
+    endif(OPENSSL_FOUND)
 else()
     add_definitions(/DXMRIG_NO_TLS)
-endif()
+endif(WITH_TLS)
 
 if (WITH_LIBCPUID)
     add_subdirectory(src/3rdparty/libcpuid)
@@ -162,8 +163,8 @@ else()
         set(SOURCES_CPUID src/Cpu_arm.cpp)
     else()
         set(SOURCES_CPUID src/Cpu_stub.cpp)
-    endif()
-endif()
+    endif(XMRIG_ARM)
+endif(WITH_LIBCPUID)
 
 CHECK_INCLUDE_FILE (syslog.h HAVE_SYSLOG_H)
 if (HAVE_SYSLOG_H)
@@ -179,11 +180,11 @@ if (WITH_HTTPD)
         set(HTTPD_SOURCES src/api/Httpd.h src/api/Httpd.cpp)
     else()
         message(FATAL_ERROR "microhttpd NOT found: use `-DWITH_HTTPD=OFF` to build without http deamon support")
-    endif()
+    endif(MHD_FOUND)
 else()
     add_definitions(/DXMRIG_NO_HTTPD)
     add_definitions(/DXMRIG_NO_API)
-endif()
+endif(WITH_HTTPD)
 
 if (WITH_CC_SERVER)
     find_package(MHD)
@@ -192,7 +193,7 @@ if (WITH_CC_SERVER)
         include_directories(${MHD_INCLUDE_DIRS})
     else()
         message(FATAL_ERROR "microhttpd NOT found: use `-DWITH_CC_SERVER=OFF` to build without CC Server support")
-    endif()
+    endif(MHD_FOUND)
 
     set(SOURCES_CC_SERVER
         src/cc/CCServer.cpp
@@ -201,12 +202,12 @@ if (WITH_CC_SERVER)
         src/cc/Httpd.cpp
         src/cc/XMRigCC.cpp
         )
-endif()
+endif(WITH_CC_SERVER)
 
 if (WITH_CC_CLIENT)
     set(SOURCES_CC_CLIENT src/cc/CCClient.cpp)
-endif()
+endif(WITH_CC_CLIENT)
 
 if (WITH_CC_SERVER OR WITH_CC_CLIENT)
     set(SOURCES_CC_COMMON
@@ -215,11 +216,34 @@ if (WITH_CC_SERVER OR WITH_CC_CLIENT)
         src/cc/GPUInfo.cpp)
 else()
     add_definitions(/DXMRIG_NO_CC)
-endif()
+endif(WITH_CC_SERVER OR WITH_CC_CLIENT)
+
+if (WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
+    if (CMAKE_C_COMPILER_ID MATCHES MSVC)
+        enable_language(ASM_MASM)
+        set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.asm")
+        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY ASM_MASM)
+    else()
+        enable_language(ASM)
+
+        if (WIN32 AND CMAKE_C_COMPILER_ID MATCHES GNU)
+            set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop_win_gcc.S")
+        else()
+            set(XMRIG_ASM_FILE "src/crypto/asm/cn_main_loop.S")
+        endif()
+
+        set_property(SOURCE ${XMRIG_ASM_FILE} PROPERTY C)
+    endif()
+
+    add_library(xmrig_asm STATIC ${XMRIG_ASM_FILE})
+    set_property(TARGET xmrig_asm PROPERTY LINKER_LANGUAGE C)
+else()
+    add_definitions(/DXMRIG_NO_ASM)
+endif(WITH_ASM AND NOT XMRIG_ARM AND CMAKE_SIZEOF_VOID_P EQUAL 8)
 
 if (BUILD_STATIC)
     set(CMAKE_EXE_LINKER_FLAGS " -static")
-endif()
+endif(BUILD_STATIC)
 
 include_directories(src)
 include_directories(src/3rdparty)
@@ -246,12 +270,16 @@ target_link_libraries(xmrigMiner xmrig_common xmrig_os_dependencies xmrig_cpuid
 
 if (WITH_CC_CLIENT)
     target_link_libraries(xmrigMiner xmrig_cc_common)
-endif (WITH_CC_CLIENT)
+endif(WITH_CC_CLIENT)
 
 if (WITH_TLS)
     target_link_libraries(xmrigMiner xmrig_tls ${OPENSSL_LIBRARIES} ${EXTRA_LIBS})
     target_link_libraries(xmrigMiner xmrig_tls ${OPENSSL_LIBRARIES} ${EXTRA_LIBS})
-endif (WITH_TLS)
+endif(WITH_TLS)
+
+if (WITH_ASM)
+    target_link_libraries(xmrigMiner xmrig_asm)
+endif(WITH_ASM)
 
 add_executable(xmrigDaemon src/cc/XMRigd.cpp res/app.rc)
 set_target_properties(xmrigDaemon PROPERTIES OUTPUT_NAME ${DAEMON_EXECUTABLE_NAME})
@@ -269,6 +297,6 @@ if (WITH_CC_SERVER AND MHD_FOUND)
     set_target_properties(xmrig_common_cc PROPERTIES COMPILE_FLAGS "-DXMRIG_CC_SERVER ${SHARED_FLAGS}")
     set_target_properties(xmrigCCServer PROPERTIES COMPILE_FLAGS "-DXMRIG_CC_SERVER ${SHARED_FLAGS}")
-endif()
+endif(WITH_CC_SERVER AND MHD_FOUND)
 
 add_subdirectory(test EXCLUDE_FROM_ALL)
diff --git a/appveyor.yml b/appveyor.yml
index 80ff01d3..5cfcf8e9 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -16,7 +16,7 @@ clone_folder: c:\xmrigCC
 
 install:
   - mkdir c:\xmrigCC-deps
-  - curl -sL https://github.com/Bendr0id/xmrigCC-deps/releases/download/v2/xmrigCC-deps.zip -o xmrigCC-deps.zip
+  - curl -sL https://github.com/Bendr0id/xmrigCC-deps/releases/download/v3/xmrigCC-deps.zip -o xmrigCC-deps.zip
   - 7z x xmrigCC-deps.zip -o"c:\xmrigCC-deps" -y > nul
 
 build_script:
diff --git a/src/AsmOptimization.h b/src/AsmOptimization.h
new file mode 100644
index 00000000..0662d305
--- /dev/null
+++ b/src/AsmOptimization.h
@@ -0,0 +1,89 @@
+/* XMRigCC
+ * Copyright 2018- BenDr0id
+ *
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef __ASM_OPTIMIZATION_H__
+#define __ASM_OPTIMIZATION_H__
+
+#include
+#include
+
+enum AsmOptimization
+{
+    ASM_AUTODETECT,
+    ASM_INTEL,
+    ASM_RYZEN,
+    ASM_NONE
+};
+
+inline std::string getAsmOptimizationName(AsmOptimization asmOptimization)
+{
+    switch (asmOptimization)
+    {
+        case ASM_INTEL:
+            return "INTEL";
+        case ASM_RYZEN:
+            return "RYZEN";
+        case ASM_NONE:
+            return "OFF";
+        case ASM_AUTODETECT:
+        default:
+            return "-1";
+    }
+}
+
+inline AsmOptimization parseAsmOptimization(int optimization)
+{
+    AsmOptimization asmOptimization = AsmOptimization::ASM_AUTODETECT;
+
+    switch (optimization) {
+        case -1:
+            asmOptimization = AsmOptimization::ASM_AUTODETECT;
+            break;
+        case 0:
+            asmOptimization = AsmOptimization::ASM_NONE;
+            break;
+        case 1:
+            asmOptimization = AsmOptimization::ASM_INTEL;
+            break;
+        case 2:
+            asmOptimization = AsmOptimization::ASM_RYZEN;
+            break;
+        default:
+            break;
+    }
+
+    return asmOptimization;
+}
+
+inline AsmOptimization parseAsmOptimization(const std::string optimization)
+{
+    AsmOptimization asmOptimization = AsmOptimization::ASM_AUTODETECT;
+
+    if (optimization == "0" || optimization == "none" || optimization == "off") {
+        asmOptimization = AsmOptimization::ASM_NONE;
+    } else if (optimization == "1" || optimization == "intel") {
+        asmOptimization = AsmOptimization::ASM_INTEL;
+    } else if (optimization == "2" || optimization == "ryzen") {
+        asmOptimization = AsmOptimization::ASM_RYZEN;
+    }
+
+    return asmOptimization;
+}
+
+
+#endif /* __ASM_OPTIMIZATION_H__ */
diff --git a/src/Cpu.cpp b/src/Cpu.cpp
index 73fcdfb4..d5da6949 100644
--- a/src/Cpu.cpp
+++ b/src/Cpu.cpp
@@ -48,6 +48,7 @@ CpuImpl::CpuImpl()
     , m_sockets(1)
     , m_totalCores(0)
     , m_totalThreads(0)
+    , m_asmOptimization(AsmOptimization::ASM_NONE)
 {
 }
 
@@ -86,9 +87,9 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
     if (threadsCount > maximumReasonableThreadCount) {
         threadsCount = maximumReasonableThreadCount;
     }
-    if (hashFactor > maximumReasonableFactor / threadsCount) {
+    if (threadsCount > 0 && hashFactor > maximumReasonableFactor / threadsCount) {
         hashFactor = std::min(maximumReasonableFactor / threadsCount, maximumReasonableHashFactor);
-        hashFactor = std::max(hashFactor, static_cast(1));
+        hashFactor = std::max(hashFactor, static_cast(1));
     }
 }
 
@@ -106,9 +107,10 @@ void CpuImpl::optimizeParameters(size_t& threadsCount, size_t& hashFactor,
         }
         threadsCount = std::max(threadsCount, static_cast(1));
     }
+
     if (hashFactor == 0) {
         hashFactor = std::min(maximumReasonableHashFactor, maximumReasonableFactor / threadsCount);
-        hashFactor = std::max(hashFactor, static_cast(1));
+        hashFactor = std::max(hashFactor, static_cast(1));
     }
 }
 
@@ -215,3 +217,8 @@ int Cpu::getAssignedCpuId(size_t threadId, int64_t affinityMask)
 
     return cpuId;
 }
+
+AsmOptimization Cpu::asmOptimization()
+{
+    return CpuImpl::instance().asmOptimization();
+}
diff --git a/src/Cpu.h b/src/Cpu.h
index a9161d67..4f8821d8 100644
--- a/src/Cpu.h
+++ b/src/Cpu.h
@@ -54,6 +54,7 @@ public:
     static size_t threads();
     static size_t availableCache();
     static int getAssignedCpuId(size_t threadId, int64_t affinityMask);
+    static AsmOptimization asmOptimization();
 };
 
diff --git a/src/CpuImpl.h b/src/CpuImpl.h
index b2bec265..56288f9a 100644
--- a/src/CpuImpl.h
+++ b/src/CpuImpl.h
@@ -51,6 +51,7 @@ public:
     size_t sockets() { return m_sockets; }
     size_t threads() { return m_totalThreads; }
     size_t availableCache();
+    AsmOptimization asmOptimization() { return m_asmOptimization; }
 
 private:
     void initCommon();
 
@@ -63,6 +64,7 @@ private:
size_t m_sockets; size_t m_totalCores; size_t m_totalThreads; + AsmOptimization m_asmOptimization; }; #endif /* __CPU_IMPL_H__ */ diff --git a/src/Cpu_cpuid.cpp b/src/Cpu_cpuid.cpp index 6251a97e..b701a994 100644 --- a/src/Cpu_cpuid.cpp +++ b/src/Cpu_cpuid.cpp @@ -80,4 +80,15 @@ void CpuImpl::initCommon() if (data.flags[CPU_FEATURE_BMI2]) { m_flags |= Cpu::BMI2; } + +# ifndef XMRIG_NO_ASM + if (data.vendor == VENDOR_AMD && data.ext_family >= 0x17) { + m_asmOptimization = AsmOptimization::ASM_RYZEN; + } else if (data.vendor == VENDOR_INTEL && + ((data.ext_family >= 0x06 && data.ext_model > 0x2) || + (data.ext_family >= 0x06 && data.ext_model == 0x2 && data.model >= 0xA))) { + m_asmOptimization = AsmOptimization::ASM_INTEL; + } +# endif + } diff --git a/src/Options.cpp b/src/Options.cpp index 6a69fade..dd373093 100644 --- a/src/Options.cpp +++ b/src/Options.cpp @@ -73,8 +73,9 @@ Options:\n" -k, --keepalive send keepalived for prevent timeout (need pool support)\n\ -r, --retries=N number of times to retry before switch to backup server (default: 5)\n\ -R, --retry-pause=N time to pause between retries (default: 5)\n\ - --pow-variant=V specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), ipbc (tube), alloy, xtl (including autodetect for v5)\n\ + --pow-variant=V specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka cnv7), 2(v2, aka cnv8), ipbc (tube), alloy, xtl (including autodetect for v5)\n\ for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations\n\ + --asm-optimization=V specificy the ASM optimization to use: -> 'auto' (default), 'intel', 'ryzen', 'none' \n\ --multihash-factor=N number of hash blocks to process at a time (don't set or 0 enables automatic selection of optimal number of hash blocks)\n\ --multihash-thread-mask=MASK limits multihash to given threads (mask), (default: all threads)\n\ --cpu-affinity set process affinity to CPU core(s), mask 0x3 for cores 0 and 1\n\ @@ -166,7 +167,7 @@ static struct option const options[] = { { "userpass", 1, nullptr, 'O' }, { "version", 0, nullptr, 'V' }, { "use-tls", 0, nullptr, 1015 }, - { "force-pow-version",1, nullptr, 1016 }, + { "multihash-thread-mask", 1, nullptr, 4013 }, { "pow-variant" ,1, nullptr, 1017 }, { "api-port", 1, nullptr, 4000 }, { "api-access-token", 1, nullptr, 4001 }, @@ -189,6 +190,7 @@ static struct option const options[] = { { "daemonized", 0, nullptr, 4011 }, { "doublehash-thread-mask", 1, nullptr, 4013 }, { "multihash-thread-mask", 1, nullptr, 4013 }, + { "asm-optimization", 1, nullptr, 4020 }, { nullptr, 0, nullptr, 0 } }; @@ -217,6 +219,7 @@ static struct option const config_options[] = { { "pow-variant", 1, nullptr, 1017 }, { "doublehash-thread-mask", 1, nullptr, 4013 }, { "multihash-thread-mask", 1, nullptr, 4013 }, + { "asm-optimization", 1, nullptr, 4020 }, { nullptr, 0, nullptr, 0 } }; @@ -282,6 +285,7 @@ constexpr static const char *pow_variant_names[] = { "auto", "0", "1", + "2", "tube", "alloy", "xtl", @@ -290,6 +294,13 @@ constexpr static const char *pow_variant_names[] = { "rto" }; +constexpr static const char *asm_optimization_names[] = { + "auto", + "intel", + "ryzen", + "none" +}; + Options *Options::parse(int argc, char **argv) { auto options = new Options(argc, argv); @@ -342,6 +353,7 @@ Options::Options(int argc, char **argv) : m_algoVariant(AV0_AUTO), m_aesni(AESNI_AUTO), m_powVariant(POW_AUTODETECT), + m_asmOptimization(ASM_AUTODETECT), m_hashFactor(0), m_apiPort(0), m_donateLevel(kDonateLevel), @@ -400,6 +412,10 @@ 
Options::Options(int argc, char **argv) : optimizeAlgorithmConfiguration(); + if (m_asmOptimization == AsmOptimization::ASM_AUTODETECT) { + m_asmOptimization = Cpu::asmOptimization(); + } + for (Url *url : m_pools) { url->applyExceptions(); } @@ -588,6 +604,9 @@ bool Options::parseArg(int key, const char *arg) case 4019: /* --cc-upload-config-on-startup */ return parseBoolean(key, true); + case 4020: /* --asm-optimization */ + return parseAsmOptimization(arg); + case 't': /* --threads */ if (strncmp(arg, "all", 3) == 0) { m_threads = Cpu::threads(); @@ -1015,11 +1034,16 @@ bool Options::parsePowVariant(const char *powVariant) break; } - if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "monerov7") || !strcmp(powVariant, "aeonv7") || !strcmp(powVariant, "v7"))) { + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "cnv1") || !strcmp(powVariant, "monerov7") || !strcmp(powVariant, "aeonv7") || !strcmp(powVariant, "v7"))) { m_powVariant = POW_V1; break; } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && (!strcmp(powVariant, "cnv2") || !strcmp(powVariant, "monerov8") || !strcmp(powVariant, "aeonv8") || !strcmp(powVariant, "v8"))) { + m_powVariant = POW_V2; + break; + } + if (i == ARRAY_SIZE(pow_variant_names) - 1 && !strcmp(powVariant, "stellite")) { m_powVariant = POW_XTL; break; @@ -1049,6 +1073,25 @@ bool Options::parsePowVariant(const char *powVariant) return true; } + +bool Options::parseAsmOptimization(const char *asmOptimization) +{ + for (size_t i = 0; i < ARRAY_SIZE(pow_variant_names); i++) { + if (pow_variant_names[i] && !strcmp(asmOptimization, asm_optimization_names[i])) { + m_asmOptimization = static_cast(i); + break; + } + + if (i == ARRAY_SIZE(asm_optimization_names) - 1) { + showUsage(1); + return false; + } + } + + return true; +} + + void Options::optimizeAlgorithmConfiguration() { // backwards compatibility for configs still setting algo variant (av) @@ -1123,5 +1166,3 @@ bool Options::parseCCUrl(const char* url) return true; } - - diff --git a/src/Options.h b/src/Options.h index 39d26ffd..31a167fe 100644 --- a/src/Options.h +++ b/src/Options.h @@ -34,6 +34,7 @@ #include "rapidjson/fwd.h" #include "PowVariant.h" +#include "AsmOptimization.h" class Url; struct option; @@ -91,6 +92,7 @@ public: inline const std::vector &pools() const { return m_pools; } inline Algo algo() const { return m_algo; } inline PowVariant powVariant() const { return m_powVariant; } + inline AsmOptimization asmOptimization() const { return m_asmOptimization; } inline bool aesni() const { return m_aesni == AESNI_ON; } inline size_t hashFactor() const { return m_hashFactor; } inline int apiPort() const { return m_apiPort; } @@ -136,6 +138,7 @@ private: bool setAlgo(const char *algo); bool parsePowVariant(const char *powVariant); + bool parseAsmOptimization(const char *arg); void optimizeAlgorithmConfiguration(); @@ -167,6 +170,7 @@ private: AlgoVariant m_algoVariant; AesNi m_aesni; PowVariant m_powVariant; + AsmOptimization m_asmOptimization; size_t m_hashFactor; int m_apiPort; int m_donateLevel; diff --git a/src/PowVariant.h b/src/PowVariant.h index fc20c02a..0bde83d6 100644 --- a/src/PowVariant.h +++ b/src/PowVariant.h @@ -27,6 +27,7 @@ enum PowVariant POW_AUTODETECT, POW_V0, POW_V1, + POW_V2, POW_TUBE, POW_ALLOY, POW_XTL, @@ -44,6 +45,8 @@ inline std::string getPowVariantName(PowVariant powVariant) return "0"; case POW_V1: return "1"; + case POW_V2: + return "2"; case POW_TUBE: return "tube"; case POW_ALLOY: @@ -88,6 +91,9 @@ inline PowVariant parseVariant(int variant) 
case 1: powVariant = PowVariant::POW_V1; break; + case 2: + powVariant = PowVariant::POW_V2; + break; default: break; } @@ -104,6 +110,8 @@ inline PowVariant parseVariant(const std::string variant) powVariant = PowVariant::POW_V0; } else if (variant == "1") { powVariant = PowVariant::POW_V1; + } else if (variant == "2") { + powVariant = PowVariant::POW_V2; } else if (variant == "ipbc" || variant == "tube" || variant == "bittube") { powVariant = PowVariant::POW_TUBE; } else if (variant == "xao" || variant == "alloy") { diff --git a/src/Summary.cpp b/src/Summary.cpp index cfad1e14..c623d4a0 100644 --- a/src/Summary.cpp +++ b/src/Summary.cpp @@ -59,17 +59,21 @@ static void print_versions() static void print_cpu() { if (Options::i()->colors()) { - Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU: %s (%d) %sx64 %sAES-NI", + Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU: %s (%d) %sx64 %sAES-NI %sASM-%s", Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "\x1B[01;32m" : "\x1B[01;31m-", - Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-"); + Cpu::hasAES() ? "\x1B[01;32m" : "\x1B[01;31m-", + Options::i()->asmOptimization() != AsmOptimization::ASM_NONE ? "\x1B[01;32m" : "\x1B[01;31m", + getAsmOptimizationName(Options::i()->asmOptimization()).c_str()); # ifndef XMRIG_NO_LIBCPUID Log::i()->text("\x1B[01;32m * \x1B[01;37mCPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0); # endif } else { - Log::i()->text(" * CPU: %s (%d) %sx64 %sAES-NI", Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "" : "-", Cpu::hasAES() ? "" : "-"); + Log::i()->text(" * CPU: %s (%d) %sx64 %sAES-NI ASM-%s", + Cpu::brand(), Cpu::sockets(), Cpu::isX64() ? "" : "-", Cpu::hasAES() ? "" : "-", + getAsmOptimizationName(Options::i()->asmOptimization()).c_str()); # ifndef XMRIG_NO_LIBCPUID Log::i()->text(" * CPU L2/L3: %.1f MB/%.1f MB", Cpu::l2() / 1024.0, Cpu::l3() / 1024.0); # endif diff --git a/src/config.json b/src/config.json index f777770c..760cf82a 100644 --- a/src/config.json +++ b/src/config.json @@ -4,8 +4,9 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto + "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations + "asm-optimization" : "auto", // specificy the ASM optimization to use: -> auto (default), intel, ryzen, none "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) "colors": true, // false to disable colored output "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 diff --git a/src/crypto/CryptoNight.cpp b/src/crypto/CryptoNight.cpp index bf9b0b08..374eca45 100644 --- a/src/crypto/CryptoNight.cpp +++ b/src/crypto/CryptoNight.cpp @@ -34,28 +34,64 @@ #include "crypto/CryptoNight_test.h" template 
-static void cryptonight_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { +#if defined(XMRIG_ARM) CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_ALLOY) { - CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_XTL) { - CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_MSR) { - CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); - } else if (powVersion == PowVariant::POW_RTO) { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); - }else { - CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); - } +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif + } else if (powVersion == PowVariant::POW_V2) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if ((asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS <= 2) || (asmOptimization == AsmOptimization::ASM_RYZEN && NUM_HASH_BLOCKS == 1)) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif +} else if (powVersion == PowVariant::POW_ALLOY) { + CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_XTL) { + CryptoNightMultiHash<0x80000, POW_XLT_V4_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_MSR) { + CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +} else if (powVersion == PowVariant::POW_RTO) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashLiteTube(input, size, output, scratchPad); +}else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); +} # endif } template -static void 
cryptonight_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { +#if defined(XMRIG_ARM) CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); + } +#endif + } else if (powVersion == PowVariant::POW_V2) { +#if defined(XMRIG_ARM) + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, false, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); +#else + if (asmOptimization == AsmOptimization::ASM_INTEL && NUM_HASH_BLOCKS == 1) { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3_asm(input, size, output, scratchPad, asmOptimization); + } else { + CryptoNightMultiHash<0x80000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hashPowV3(input, size, output, scratchPad); + } +#endif } else if (powVersion == PowVariant::POW_ALLOY) { CryptoNightMultiHash<0x100000, POW_DEFAULT_INDEX_SHIFT, MEMORY, 0x1FFFF0, true, NUM_HASH_BLOCKS>::hash(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_XTL) { @@ -70,7 +106,7 @@ static void cryptonight_softaes(PowVariant powVersion, const uint8_t* input, siz } template -static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_lite_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == PowVariant::POW_V1) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, false, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); @@ -83,7 +119,7 @@ static void cryptonight_lite_aesni(PowVariant powVersion, const uint8_t* input, } template -static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_lite_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_V1) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_LITE, 0xFFFF0, true, NUM_HASH_BLOCKS>::hashPowV2(input, size, output, scratchPad); } else if (powVersion == PowVariant::POW_TUBE) { @@ -94,7 +130,7 @@ static void cryptonight_lite_softaes(PowVariant powVersion, const uint8_t* input } template -static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_heavy_aesni(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { # if !defined(XMRIG_ARMv7) if (powVersion == 
PowVariant::POW_XHV) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, false, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); @@ -109,7 +145,7 @@ static void cryptonight_heavy_aesni(PowVariant powVersion, const uint8_t* input, } template -static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { +static void cryptonight_heavy_softaes(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { if (powVersion == PowVariant::POW_XHV) { CryptoNightMultiHash<0x40000, POW_DEFAULT_INDEX_SHIFT, MEMORY_HEAVY, 0x3FFFF0, true, NUM_HASH_BLOCKS>::hashHeavyHaven(input, size, output, scratchPad); } @@ -121,7 +157,7 @@ static void cryptonight_heavy_softaes(PowVariant powVersion, const uint8_t* inpu } } -void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); +void (*cryptonight_hash_ctx[MAX_NUM_HASH_BLOCKS])(AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad); template void setCryptoNightHashMethods(Options::Algo algo, bool aesni) @@ -163,13 +199,19 @@ void setCryptoNightHashMethods<0>(Options::Algo algo, bool aesni) bool CryptoNight::init(int algo, bool aesni) { + for (int i = 0; i < 256; ++i) + { + const uint64_t index = (((i >> 3) & 6) | (i & 1)) << 1; + variant1_table[i] = i ^ ((0x75310 >> index) & 0x30); + } + setCryptoNightHashMethods(static_cast(algo), aesni); return selfTest(algo); } -void CryptoNight::hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) +void CryptoNight::hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPad) { - cryptonight_hash_ctx[factor-1](powVersion, input, size, output, scratchPad); + cryptonight_hash_ctx[factor-1](asmOptimization, powVersion, input, size, output, scratchPad); } bool CryptoNight::selfTest(int algo) @@ -206,203 +248,231 @@ bool CryptoNight::selfTest(int algo) bool resultLite = true; bool resultHeavy = true; + AsmOptimization asmOptimization = Options::i()->asmOptimization(); + if (algo == Options::ALGO_CRYPTONIGHT_HEAVY) { // cn-heavy - cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy, 96) == 0; #endif // cn-heavy haven - cryptonight_hash_ctx[0](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, 
test_output_heavy_haven, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XHV, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XHV, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_haven, 96) == 0; #endif // cn-heavy bittube - cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultHeavy = resultHeavy && memcmp(output, test_output_heavy_tube, 96) == 0; #endif + } else if (algo == Options::ALGO_CRYPTONIGHT_LITE) { // cn-lite v0 - cryptonight_hash_ctx[0](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v0_lite, 160) == 0; #endif // cn-lite v7 tests - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && 
memcmp(output, test_output_v1_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_v1_lite, 160) == 0; #endif // cn-lite ibpc tests - cryptonight_hash_ctx[0](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_TUBE, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_TUBE, test_input, 76, output, scratchPads); resultLite = resultLite && memcmp(output, test_output_ipbc_lite, 160) == 0; #endif } else { - // cn v0 + // cn v0 aka orignal - cryptonight_hash_ctx[0](PowVariant::POW_V0,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V0,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 128) == 0; #endif #if 
MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V0, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V0, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v0, 160) == 0; #endif - // cn v7 + // cn v7 aka cnv1 - cryptonight_hash_ctx[0](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_V1, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V1, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_v1, 160) == 0; #endif - // cn xtl + // cn v7 + xtl - cryptonight_hash_ctx[0](PowVariant::POW_XTL,test_input, 76, output, scratchPads); + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_XTL,test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 32) == 0; #if MAX_NUM_HASH_BLOCKS > 1 - cryptonight_hash_ctx[1](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 64) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 2 - cryptonight_hash_ctx[2](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[2](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 96) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 3 - cryptonight_hash_ctx[3](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 128) == 0; #endif #if MAX_NUM_HASH_BLOCKS > 4 - cryptonight_hash_ctx[4](PowVariant::POW_XTL, test_input, 76, output, scratchPads); + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_XTL, test_input, 76, output, scratchPads); result = result && memcmp(output, test_output_xtl, 160) == 0; #endif + + // cn v8 aka cnv2 + + cryptonight_hash_ctx[0](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 32) == 0; + + #if MAX_NUM_HASH_BLOCKS > 1 + cryptonight_hash_ctx[1](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 64) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 2 + cryptonight_hash_ctx[2](asmOptimization, 
PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 96) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 3 + cryptonight_hash_ctx[3](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 128) == 0; + #endif + + #if MAX_NUM_HASH_BLOCKS > 4 + cryptonight_hash_ctx[4](asmOptimization, PowVariant::POW_V2, test_input, 76, output, scratchPads); + result = result && memcmp(output, test_output_v2, 160) == 0; + #endif } for (size_t i = 0; i < MAX_NUM_HASH_BLOCKS; ++i) { diff --git a/src/crypto/CryptoNight.h b/src/crypto/CryptoNight.h index 753c56fc..1275d669 100644 --- a/src/crypto/CryptoNight.h +++ b/src/crypto/CryptoNight.h @@ -25,9 +25,10 @@ #define __CRYPTONIGHT_H__ -#include -#include +#include +#include +#include "AsmOptimization.h" #include "Options.h" #define MEMORY 2097152 /* 2 MiB */ @@ -38,10 +39,17 @@ #define POW_XLT_V4_INDEX_SHIFT 4 struct ScratchPad { - alignas(16) uint8_t state[208]; // 208 instead of 200 to maintain aligned to 16 byte boundaries + alignas(16) uint8_t state[224]; // 224 instead of 200 to maintain aligned to 16 byte boundaries alignas(16) uint8_t* memory; + + // Additional stuff for asm impl + uint8_t ctx_info[24]; + const void* input; + uint8_t* variant1_table; + const uint32_t* t_fn; }; +alignas(64) static uint8_t variant1_table[256]; class Job; class JobResult; @@ -50,8 +58,9 @@ class CryptoNight { public: static bool init(int algo, bool aesni); + static void hash(size_t factor, AsmOptimization asmOptimization, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); - static void hash(size_t factor, PowVariant powVersion, const uint8_t* input, size_t size, uint8_t* output, ScratchPad** scratchPads); +public: private: static bool selfTest(int algo); diff --git a/src/crypto/CryptoNight_arm.h b/src/crypto/CryptoNight_arm.h index 377c0002..9df769fb 100644 --- a/src/crypto/CryptoNight_arm.h +++ b/src/crypto/CryptoNight_arm.h @@ -36,6 +36,7 @@ #endif +#include #include #include "crypto/CryptoNight.h" @@ -110,6 +111,44 @@ static inline __attribute__((always_inline)) uint64_t _mm_cvtsi128_si64(__m128i #define EXTRACT64(X) _mm_cvtsi128_si64(X) +# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \ +{ \ + const uint64x2_t chunk1 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \ +} + +# define INTEGER_MATH_V2(idx, cl, cx) \ +{ \ + const uint64_t cx_0 = _mm_cvtsi128_si64(cx); \ + cl ^= division_result_xmm##idx ^ (sqrt_result##idx << 32); \ + const uint32_t d = static_cast(cx_0 + (sqrt_result##idx << 1)) | 0x80000001UL; \ + const uint64_t cx_1 = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + division_result_xmm##idx = static_cast(cx_1 / d) + ((cx_1 % d) << 32); \ + const uint64_t sqrt_input = cx_0 + division_result_xmm##idx; \ + sqrt_result##idx = sqrt(sqrt_input + 18446744073709551616.0) * 2.0 - 8589934592.0; \ + const uint64_t s = sqrt_result##idx >> 1; \ + const uint64_t b = sqrt_result##idx & 1; \ + const uint64_t r2 = (uint64_t)(s) * (s + b) + (sqrt_result##idx << 32); \ + 
sqrt_result##idx += ((r2 + b > sqrt_input) ? -1 : 0) + ((r2 + (1ULL << 32) < sqrt_input - s) ? 1 : 0); \ +} + +# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \ +{ \ + const uint64x2_t chunk1 = veorq_u64(vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x10))), vcombine_u64(vcreate_u64(hi), vcreate_u64(lo))); \ + const uint64x2_t chunk2 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x20))); \ + const uint64x2_t chunk3 = vld1q_u64((uint64_t*)((l) + ((idx) ^ 0x30))); \ + hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x10)), vaddq_u64(chunk3, vreinterpretq_u64_u8(bx1))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x20)), vaddq_u64(chunk1, vreinterpretq_u64_u8(bx0))); \ + vst1q_u64((uint64_t*)((l) + ((idx) ^ 0x30)), vaddq_u64(chunk2, vreinterpretq_u64_u8(ax))); \ +} + + #if defined (__arm64__) || defined (__aarch64__) static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) { @@ -121,23 +160,17 @@ static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uint64_t* product_hi) { - // multiplier = ab = a * 2^32 + b - // multiplicand = cd = c * 2^32 + d - // ab * cd = a * c * 2^64 + (a * d + b * c) * 2^32 + b * d uint64_t a = multiplier >> 32; uint64_t b = multiplier & 0xFFFFFFFF; uint64_t c = multiplicand >> 32; uint64_t d = multiplicand & 0xFFFFFFFF; - //uint64_t ac = a * c; uint64_t ad = a * d; - //uint64_t bc = b * c; uint64_t bd = b * d; uint64_t adbc = ad + (b * c); uint64_t adbc_carry = adbc < ad ? 1 : 0; - // multiplier * multiplicand = product_hi * 2^64 + product_lo uint64_t product_lo = bd + (adbc << 32); uint64_t product_lo_carry = product_lo < bd ? 1 : 0; *product_hi = (a * c) + (adbc >> 32) + (adbc_carry << 32) + product_lo_carry; @@ -621,8 +654,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, @@ -643,23 +678,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = 
((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -673,6 +712,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -693,9 +734,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, @@ -712,37 +755,42 @@ public: al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; } for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; - ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); al[hashBlock] += hi; @@ -750,14 +798,123 @@ public: ah[hashBlock] ^= tweak1_2[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; - ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = 
ah[hashBlock]; + ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; ah[hashBlock] ^= tweak1_2[hashBlock]; ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } + + // multi + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t sqrt_result[NUM_HASH_BLOCKS]; + uint64_t division_result_xmm[NUM_HASH_BLOCKS]; + __m128i bx0[NUM_HASH_BLOCKS]; + __m128i bx1[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, + scratchPad[hashBlock]->state, 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + + division_result_xmm[hashBlock] = h[hashBlock][12]; + sqrt_result[hashBlock] = h[hashBlock][13]; + } + + uint64_t sqrt_result0; + uint64_t division_result_xmm0; + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + if (SOFT_AES) { + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); + } else { + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; + + sqrt_result0 = sqrt_result[hashBlock]; + division_result_xmm0 = division_result_xmm[hashBlock]; + + INTEGER_MATH_V2(0, cl, cx[hashBlock]) + + sqrt_result[hashBlock] = sqrt_result0; + 
division_result_xmm[hashBlock] = division_result_xmm0; + + lo = __umul128(idx[hashBlock], cl, &hi); + + SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + bx1[hashBlock] = bx0[hashBlock]; + bx0[hashBlock] = cx[hashBlock]; } } @@ -1271,6 +1428,79 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + // single + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l; + uint64_t* h; + uint64_t al; + uint64_t ah; + uint64_t idx; + __m128i bx0; + __m128i bx1; + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + l = scratchPad[0]->memory; + h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + al = h[0] ^ h[4]; + ah = h[1] ^ h[5]; + bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); + idx = h[0] ^ h[4]; + + uint64_t division_result_xmm0 = h[12]; + uint64_t sqrt_result0 = h[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + const __m128i ax = _mm_set_epi64x(ah, al); + + __m128i cx; + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*) &l[idx & MASK], ax); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, ax); + } + + SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx) + + lo = __umul128(idx, cl, &hi); + + SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + + al += hi; + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1580,6 +1810,7 @@ public: } }; + template class CryptoNightMultiHash { @@ -1783,6 +2014,128 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + // double + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + __m128i bx01 = _mm_set_epi64x(h1[3] 
^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + __m128i cx0; + __m128i cx1; + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2565,6 +2918,172 @@ public: extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } + // triple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] 
^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 
24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -3617,6 +4136,217 @@ public: extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } + // quadruple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + uint64_t division_result_xmm3 = h3[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, 
ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -4265,6 +4995,262 @@ public: extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } + // quintuple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(input, (int) size, scratchPad[0]->state, 200); + keccak(input + size, (int) size, scratchPad[1]->state, 200); + 
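// Illustrative sketch (not part of the patch itself): the SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax)
// step used in the main loops above reads the three sibling 16-byte chunks of the current
// 64-byte cache line (offsets idx ^ 0x10, idx ^ 0x20, idx ^ 0x30), rotates them one slot and
// adds bx1, bx0 and ax with 64-bit lane-wise adds (as _mm_add_epi64 does). A minimal scalar
// reading, assuming idx is already masked into the scratchpad and each __m128i is viewed as
// two uint64_t lanes (index 0 = low lane); the helper name is hypothetical.
#include <cstdint>

static inline void shuffle_phase_1_sketch(uint8_t* l, uint64_t idx,
                                          const uint64_t bx0[2],
                                          const uint64_t bx1[2],
                                          const uint64_t ax[2])
{
    uint64_t* chunk1 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x10));
    uint64_t* chunk2 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x20));
    uint64_t* chunk3 = reinterpret_cast<uint64_t*>(l + (idx ^ 0x30));

    const uint64_t t1[2] = { chunk1[0], chunk1[1] };
    const uint64_t t2[2] = { chunk2[0], chunk2[1] };
    const uint64_t t3[2] = { chunk3[0], chunk3[1] };

    chunk1[0] = t3[0] + bx1[0]; chunk1[1] = t3[1] + bx1[1];  // slot 0x10 <- chunk3 + bx1
    chunk2[0] = t1[0] + bx0[0]; chunk2[1] = t1[1] + bx0[1];  // slot 0x20 <- chunk1 + bx0
    chunk3[0] = t2[0] + ax[0];  chunk3[1] = t2[1] + ax[1];   // slot 0x30 <- chunk2 + ax
}
// SHUFFLE_PHASE_2 performs the same rotation and adds, except that the chunk at idx ^ 0x10 is
// first XOR-ed with the 128-bit value built from hi and lo, and hi and lo are XOR-ed with the
// two 64-bit halves of the old chunk at idx ^ 0x20 before the multiply result is consumed.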
keccak(input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak(input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak(input + 4 * size, (int) size, scratchPad[4]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + uint64_t division_result_xmm0 = h0[12]; + uint64_t division_result_xmm1 = h1[12]; + uint64_t division_result_xmm2 = h2[12]; + uint64_t division_result_xmm3 = h3[12]; + uint64_t division_result_xmm4 = h4[12]; + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + uint64_t sqrt_result4 = h4[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*) &l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*) &l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*) &l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*) &l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*) &l4[idx4 & MASK], ax4); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = 
_mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + + INTEGER_MATH_V2(4, cl, cx4); + + lo = __umul128(idx4, cl, &hi); + + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + + al4 += hi; + ah4 += lo; + + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + + bx14 = bx04; + bx04 = cx4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + 
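// Illustrative scalar reading of the INTEGER_MATH_V2(i, cl, cx) step used in the loops above
// (a sketch for understanding only, not the macro itself). division_result and sqrt_result
// correspond to the per-lane state words seeded from h[12] and h[13]; cx_lo and cx_hi are the
// two 64-bit lanes of cx. int_sqrt_v2() is the square-root helper added by this patch (its
// SSE implementation appears in the CryptoNight_x86.h hunk below); the function name here is
// hypothetical.
#include <cstdint>

static inline void integer_math_v2_sketch(uint64_t& cl, uint64_t cx_lo, uint64_t cx_hi,
                                          uint64_t& division_result, uint64_t& sqrt_result)
{
    // fold the previous division and sqrt results into cl before the 64x64 multiply
    cl ^= division_result ^ (sqrt_result << 32);

    // divisor derived from the low cx lane; OR-ing 0x80000001 keeps it odd and non-trivial
    const uint32_t d = static_cast<uint32_t>((cx_lo + (sqrt_result << 1)) | 0x80000001UL);

    // pack the 32-bit quotient (low half) and remainder (high half) into one state word
    division_result = static_cast<uint32_t>(cx_hi / d) + ((cx_hi % d) << 32);

    // integer square root of the mixed value; the SSE version uses _mm_sqrt_sd under
    // FE_UPWARD rounding plus int_sqrt_v2_fixup() to make the result exact
    sqrt_result = int_sqrt_v2(cx_lo + division_result);
}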
extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, diff --git a/src/crypto/CryptoNight_test.h b/src/crypto/CryptoNight_test.h index 0aee57b3..2952f140 100644 --- a/src/crypto/CryptoNight_test.h +++ b/src/crypto/CryptoNight_test.h @@ -26,155 +26,169 @@ #define __CRYPTONIGHT_TEST_H__ const static uint8_t test_input[380] = { - 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, - 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, - 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, - 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, - 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, - 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, - 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, - 0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, - 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, - 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02, - 0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF, - 0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB, - 0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29, - 0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8, - 0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E, - 0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26, - 0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF, - 0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8, - 0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2, - 0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01, - 0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2, - 0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17, - 0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5, - 0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38, - 0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04 + 0x03, 0x05, 0xA0, 0xDB, 0xD6, 0xBF, 0x05, 0xCF, 0x16, 0xE5, 0x03, 0xF3, 0xA6, 0x6F, 0x78, 0x00, + 0x7C, 0xBF, 0x34, 0x14, 0x43, 0x32, 0xEC, 0xBF, 0xC2, 0x2E, 0xD9, 0x5C, 0x87, 0x00, 0x38, 0x3B, + 0x30, 0x9A, 0xCE, 0x19, 0x23, 0xA0, 0x96, 0x4B, 0x00, 0x00, 0x00, 0x08, 0xBA, 0x93, 0x9A, 0x62, + 0x72, 0x4C, 0x0D, 0x75, 0x81, 0xFC, 0xE5, 0x76, 0x1E, 0x9D, 0x8A, 0x0E, 0x6A, 0x1C, 0x3F, 0x92, + 0x4F, 0xDD, 0x84, 0x93, 0xD1, 0x11, 0x56, 0x49, 0xC0, 0x5E, 0xB6, 0x01, + 0x01, 0x00, 0xFB, 0x8E, 0x8A, 0xC8, 0x05, 0x89, 0x93, 0x23, 0x37, 0x1B, 0xB7, 0x90, 0xDB, 0x19, + 0x21, 0x8A, 0xFD, 0x8D, 0xB8, 0xE3, 0x75, 0x5D, 0x8B, 0x90, 0xF3, 0x9B, 0x3D, 0x55, 0x06, 0xA9, + 
0xAB, 0xCE, 0x4F, 0xA9, 0x12, 0x24, 0x45, 0x00, 0x00, 0x00, 0x00, 0xEE, 0x81, 0x46, 0xD4, 0x9F, + 0xA9, 0x3E, 0xE7, 0x24, 0xDE, 0xB5, 0x7D, 0x12, 0xCB, 0xC6, 0xC6, 0xF3, 0xB9, 0x24, 0xD9, 0x46, + 0x12, 0x7C, 0x7A, 0x97, 0x41, 0x8F, 0x93, 0x48, 0x82, 0x8F, 0x0F, 0x02, + 0x07, 0x07, 0xB4, 0x87, 0xD0, 0xD6, 0x05, 0x26, 0xE0, 0xC6, 0xDD, 0x9B, 0xC7, 0x18, 0xC3, 0xCF, + 0x52, 0x04, 0xBD, 0x4F, 0x9B, 0x27, 0xF6, 0x73, 0xB9, 0x3F, 0xEF, 0x7B, 0xB2, 0xF7, 0x2B, 0xBB, + 0x3F, 0x3E, 0x9C, 0x3E, 0x9D, 0x33, 0x1E, 0xDE, 0xAD, 0xBE, 0xEF, 0x4E, 0x00, 0x91, 0x81, 0x29, + 0x74, 0xB2, 0x70, 0xE7, 0x6D, 0xD2, 0x2A, 0x5F, 0x52, 0x04, 0x93, 0xE6, 0x18, 0x89, 0x40, 0xD8, + 0xC6, 0xE3, 0x90, 0x6E, 0xAA, 0x6A, 0xB7, 0xE2, 0x08, 0x7E, 0x78, 0x0E, + 0x01, 0x00, 0xEE, 0xB2, 0xD1, 0xD6, 0x05, 0xFF, 0x27, 0x7F, 0x26, 0xDB, 0xAA, 0xB2, 0xC9, 0x26, + 0x30, 0xC6, 0xCF, 0x11, 0x64, 0xEA, 0x6C, 0x8A, 0xE0, 0x98, 0x01, 0xF8, 0x75, 0x4B, 0x49, 0xAF, + 0x79, 0x70, 0xAE, 0xEE, 0xA7, 0x62, 0x2C, 0x00, 0x00, 0x00, 0x00, 0x47, 0x8C, 0x63, 0xE7, 0xD8, + 0x40, 0x02, 0x3C, 0xDA, 0xEA, 0x92, 0x52, 0x53, 0xAC, 0xFD, 0xC7, 0x8A, 0x4C, 0x31, 0xB2, 0xF2, + 0xEC, 0x72, 0x7B, 0xFF, 0xCE, 0xC0, 0xE7, 0x12, 0xD4, 0xE9, 0x2A, 0x01, + 0x07, 0x07, 0xA9, 0xB7, 0xD1, 0xD6, 0x05, 0x3F, 0x0D, 0x5E, 0xFD, 0xC7, 0x03, 0xFC, 0xFC, 0xD2, + 0xCE, 0xBC, 0x44, 0xD8, 0xAB, 0x44, 0xA6, 0xA0, 0x3A, 0xE4, 0x4D, 0x8F, 0x15, 0xAF, 0x62, 0x17, + 0xD1, 0xE0, 0x92, 0x85, 0xE4, 0x73, 0xF9, 0x00, 0x00, 0x00, 0xA0, 0xFC, 0x09, 0xDE, 0xAB, 0xF5, + 0x8B, 0x6F, 0x1D, 0xCA, 0xA8, 0xBA, 0xAC, 0x74, 0xDD, 0x74, 0x19, 0xD5, 0xD6, 0x10, 0xEC, 0x38, + 0xCF, 0x50, 0x29, 0x6A, 0x07, 0x0B, 0x93, 0x8F, 0x8F, 0xA8, 0x10, 0x04 }; // CN const static uint8_t test_output_v0[160] = { - 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, - 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, - 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, - 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, - 0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0, - 0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C, - 0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D, - 0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7, - 0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 0x24, 0xA8, 0x97, 0x46, 0x62, - 0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0 + 0x1A, 0x3F, 0xFB, 0xEE, 0x90, 0x9B, 0x42, 0x0D, 0x91, 0xF7, 0xBE, 0x6E, 0x5F, 0xB5, 0x6D, 0xB7, + 0x1B, 0x31, 0x10, 0xD8, 0x86, 0x01, 0x1E, 0x87, 0x7E, 0xE5, 0x78, 0x6A, 0xFD, 0x08, 0x01, 0x00, + 0x1B, 0x60, 0x6A, 0x3F, 0x4A, 0x07, 0xD6, 0x48, 0x9A, 0x1B, 0xCD, 0x07, 0x69, 0x7B, 0xD1, 0x66, + 0x96, 0xB6, 0x1C, 0x8A, 0xE9, 0x82, 0xF6, 0x1A, 0x90, 0x16, 0x0F, 0x4E, 0x52, 0x82, 0x8A, 0x7F, + 0xA1, 0xB4, 0xFA, 0xE3, 0xE5, 0x76, 0xCE, 0xCF, 0xB7, 0x9C, 0xAF, 0x3E, 0x29, 0x92, 0xE4, 0xE0, + 0x31, 0x24, 0x05, 0x48, 0xBF, 0x8D, 0x5F, 0x7B, 0x11, 0x03, 0x60, 0xAA, 0xD7, 0x50, 0x3F, 0x0C, + 0x2D, 0x30, 0xF3, 0x87, 0x4F, 0x86, 0xA1, 0x4A, 0xB5, 0xA2, 0x1A, 0x08, 0xD0, 0x44, 0x2C, 0x9D, + 0x16, 0xE9, 0x28, 0x49, 0xA1, 0xFF, 0x85, 0x6F, 0x12, 0xBB, 0x7D, 0xAB, 0x11, 0x1C, 0xE7, 0xF7, + 0x2D, 0x9D, 0x19, 0xE4, 0xD2, 0x26, 0x44, 0x1E, 0xCD, 0x22, 0x08, 
0x24, 0xA8, 0x97, 0x46, 0x62, + 0x04, 0x84, 0x90, 0x4A, 0xEE, 0x99, 0x14, 0xED, 0xB8, 0xC6, 0x0D, 0x37, 0xA1, 0x66, 0x17, 0xB0 }; // CN v7 const static uint8_t test_output_v1[160] = { - 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, - 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, - 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, - 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, - 0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98, - 0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C, - 0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE, - 0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74, - 0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B, - 0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24 + 0xF2, 0x2D, 0x3D, 0x62, 0x03, 0xD2, 0xA0, 0x8B, 0x41, 0xD9, 0x02, 0x72, 0x78, 0xD8, 0xBC, 0xC9, + 0x83, 0xAC, 0xAD, 0xA9, 0xB6, 0x8E, 0x52, 0xE3, 0xC6, 0x89, 0x69, 0x2A, 0x50, 0xE9, 0x21, 0xD9, + 0xC9, 0xFA, 0xE8, 0x42, 0x5D, 0x86, 0x88, 0xDC, 0x23, 0x6B, 0xCD, 0xBC, 0x42, 0xFD, 0xB4, 0x2D, + 0x37, 0x6C, 0x6E, 0xC1, 0x90, 0x50, 0x1A, 0xA8, 0x4B, 0x04, 0xA4, 0xB4, 0xCF, 0x1E, 0xE1, 0x22, + 0xE7, 0x8C, 0x5A, 0x6E, 0x38, 0x30, 0x68, 0x4A, 0x73, 0xFC, 0x1B, 0xC6, 0x6D, 0xFC, 0x8D, 0x98, + 0xB4, 0xC2, 0x23, 0x39, 0xAD, 0xE0, 0x9D, 0xF6, 0x6D, 0x8C, 0x6A, 0xAA, 0xF9, 0xB2, 0xE3, 0x4C, + 0xB6, 0x90, 0x6C, 0xE6, 0x15, 0x5E, 0x46, 0x07, 0x9C, 0xB2, 0x6B, 0xAC, 0x3B, 0xAC, 0x1A, 0xDE, + 0x92, 0x2C, 0xD6, 0x0C, 0x46, 0x9D, 0x9B, 0xC2, 0x84, 0x52, 0x65, 0xF6, 0xBD, 0xFA, 0x0D, 0x74, + 0x00, 0x66, 0x10, 0x07, 0xF1, 0x19, 0x06, 0x3A, 0x6C, 0xFF, 0xEE, 0xB2, 0x40, 0xE5, 0x88, 0x2B, + 0x6C, 0xAB, 0x6B, 0x1D, 0x88, 0xB8, 0x44, 0x25, 0xF4, 0xEA, 0xB7, 0xEC, 0xBA, 0x12, 0x8A, 0x24 }; +// CN V8 +const static uint8_t test_output_v2[160] = { + 0x97, 0x37, 0x82, 0x82, 0xcf, 0x10, 0xe7, 0xad, 0x03, 0x3f, 0x7b, 0x80, 0x74, 0xc4, 0x0e, 0x14, + 0xd0, 0x6e, 0x7f, 0x60, 0x9d, 0xdd, 0xda, 0x78, 0x76, 0x80, 0xb5, 0x8c, 0x05, 0xf4, 0x3d, 0x21, + 0x87, 0x1f, 0xcd, 0x68, 0x23, 0xf6, 0xa8, 0x79, 0xbb, 0x3f, 0x33, 0x95, 0x1c, 0x8e, 0x8e, 0x89, + 0x1d, 0x40, 0x43, 0x88, 0x0b, 0x02, 0xdf, 0xa1, 0xbb, 0x3b, 0xe4, 0x98, 0xb5, 0x0e, 0x75, 0x78, + 0xe6, 0x0d, 0x24, 0x0f, 0x65, 0x85, 0x60, 0x3a, 0x4a, 0xe5, 0x5f, 0x54, 0x9b, 0xc8, 0x79, 0x93, + 0xeb, 0x3d, 0x98, 0x2c, 0xfe, 0x9b, 0xfb, 0x15, 0xb6, 0x88, 0x21, 0x94, 0xb0, 0x05, 0x86, 0x5c, + 0x59, 0x8b, 0x93, 0x7a, 0xda, 0xd2, 0xa2, 0x14, 0xed, 0xb7, 0xc4, 0x5d, 0xa1, 0xef, 0x26, 0xf3, + 0xc7, 0x73, 0x29, 0x4d, 0xf1, 0xc8, 0x2c, 0xe0, 0xd0, 0xe9, 0xed, 0x0c, 0x70, 0x75, 0x05, 0x3e, + 0x5b, 0xf6, 0xa0, 0x6e, 0xea, 0xde, 0x87, 0x0b, 0x06, 0x29, 0x03, 0xbf, 0xb4, 0x85, 0x9d, 0x04, + 0x75, 0x1a, 0xcd, 0x1e, 0xd6, 0xaa, 0x1b, 0x05, 0x24, 0x6a, 0x2c, 0x80, 0x69, 0x68, 0xdc, 0x97 +}; + // CN XTL const static uint8_t test_output_xtl[160] = { - 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, - 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, - 0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90, - 0x76, 0xED, 
0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF, - 0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3, - 0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86, - 0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D, - 0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF, - 0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA, - 0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48 + 0x8F, 0xE5, 0xF0, 0x5F, 0x02, 0x2A, 0x61, 0x7D, 0xE5, 0x3F, 0x79, 0x36, 0x4B, 0x25, 0xCB, 0xC3, + 0xC0, 0x8E, 0x0E, 0x1F, 0xE3, 0xBE, 0x48, 0x57, 0x07, 0x03, 0xFE, 0xE1, 0xEC, 0x0E, 0xB0, 0xB1, + 0x21, 0x26, 0xFF, 0x98, 0xE6, 0x86, 0x08, 0x5B, 0xC9, 0x96, 0x44, 0xA3, 0xB8, 0x4E, 0x28, 0x90, + 0x76, 0xED, 0xAD, 0xB9, 0xAA, 0xAC, 0x01, 0x94, 0x1D, 0xBE, 0x3E, 0xEA, 0xAD, 0xEE, 0xB2, 0xCF, + 0xB0, 0x43, 0x4B, 0x88, 0xFC, 0xB2, 0xF3, 0x82, 0x9D, 0xD7, 0xDF, 0x51, 0x97, 0x2C, 0x5A, 0xE3, + 0xC7, 0x16, 0x0B, 0xC8, 0x7C, 0xB7, 0x2F, 0x1C, 0x55, 0x33, 0xCA, 0xE1, 0xEE, 0x08, 0xA4, 0x86, + 0x60, 0xED, 0x6E, 0x9D, 0x2D, 0x05, 0x0D, 0x7D, 0x02, 0x49, 0x23, 0x39, 0x7C, 0xC3, 0x6D, 0x3D, + 0x05, 0x51, 0x28, 0xF1, 0x9B, 0x3C, 0xDF, 0xC4, 0xEA, 0x8A, 0xA6, 0x6A, 0x3C, 0x8B, 0xE2, 0xAF, + 0x47, 0x00, 0xFC, 0x36, 0xED, 0x50, 0xBB, 0xD2, 0x2E, 0x63, 0x4B, 0x93, 0x11, 0x0C, 0xA7, 0xBA, + 0x32, 0x6E, 0x47, 0x4D, 0xCE, 0xCC, 0x82, 0x54, 0x1D, 0x06, 0xF8, 0x06, 0x86, 0xBD, 0x22, 0x48 }; const static uint8_t test_output_v0_lite[160] = { - 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, - 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, - 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, - 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, - 0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E, - 0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18, - 0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37, - 0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53, - 0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 0x22, 0xC9, - 0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5 + 0x36, 0x95, 0xB4, 0xB5, 0x3B, 0xB0, 0x03, 0x58, 0xB0, 0xAD, 0x38, 0xDC, 0x16, 0x0F, 0xEB, 0x9E, + 0x00, 0x4E, 0xEC, 0xE0, 0x9B, 0x83, 0xA7, 0x2E, 0xF6, 0xBA, 0x98, 0x64, 0xD3, 0x51, 0x0C, 0x88, + 0x28, 0xA2, 0x2B, 0xAD, 0x3F, 0x93, 0xD1, 0x40, 0x8F, 0xCA, 0x47, 0x2E, 0xB5, 0xAD, 0x1C, 0xBE, + 0x75, 0xF2, 0x1D, 0x05, 0x3C, 0x8C, 0xE5, 0xB3, 0xAF, 0x10, 0x5A, 0x57, 0x71, 0x3E, 0x21, 0xDD, + 0x38, 0x08, 0xE1, 0x17, 0x0B, 0x99, 0x8D, 0x1A, 0x3C, 0xCE, 0x35, 0xC5, 0xC7, 0x3A, 0x00, 0x2E, + 0xCB, 0x54, 0xF0, 0x78, 0x2E, 0x9E, 0xDB, 0xC7, 0xDF, 0x2E, 0x71, 0x9A, 0x16, 0x97, 0xC4, 0x18, + 0x4B, 0x97, 0x07, 0xFE, 0x5D, 0x98, 0x9A, 0xD6, 0xD8, 0xE5, 0x92, 0x66, 0x87, 0x7F, 0x19, 0x37, + 0xA2, 0x5E, 0xE6, 0x96, 0xB5, 0x97, 0x33, 0x89, 0xE0, 0xA7, 0xC9, 0xDD, 0x4A, 0x7E, 0x9E, 0x53, + 0xBE, 0x91, 0x2B, 0xF5, 0xF5, 0xAF, 0xDD, 0x09, 0xA2, 0xF4, 0xA4, 0x56, 0xEB, 0x96, 
0x22, 0xC9, + 0x94, 0xFB, 0x7B, 0x28, 0xC9, 0x97, 0x65, 0x04, 0xAC, 0x4F, 0x84, 0x71, 0xDA, 0x6E, 0xD8, 0xC5 }; // CN-Lite v7 const static uint8_t test_output_v1_lite[160] = { - 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, - 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, - 0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45, - 0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F, - 0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB, - 0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32, - 0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E, - 0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4, - 0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9, - 0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6 + 0x6D, 0x8C, 0xDC, 0x44, 0x4E, 0x9B, 0xBB, 0xFD, 0x68, 0xFC, 0x43, 0xFC, 0xD4, 0x85, 0x5B, 0x22, + 0x8C, 0x8A, 0x1B, 0xD9, 0x1D, 0x9D, 0x00, 0x28, 0x5B, 0xEC, 0x02, 0xB7, 0xCA, 0x2D, 0x67, 0x41, + 0x87, 0xC4, 0xE5, 0x70, 0x65, 0x3E, 0xB4, 0xC2, 0xB4, 0x2B, 0x7A, 0x0D, 0x54, 0x65, 0x59, 0x45, + 0x2D, 0xFA, 0xB5, 0x73, 0xB8, 0x2E, 0xC5, 0x2F, 0x15, 0x2B, 0x7F, 0xF9, 0x8E, 0x79, 0x44, 0x6F, + 0x16, 0x08, 0x74, 0xC7, 0xA2, 0xD2, 0xA3, 0x97, 0x95, 0x76, 0xCA, 0x4D, 0x06, 0x39, 0x7A, 0xAB, + 0x6C, 0x87, 0x58, 0x33, 0x4D, 0xC8, 0x5A, 0xAB, 0x04, 0x27, 0xFE, 0x8B, 0x1C, 0x23, 0x2F, 0x32, + 0xC0, 0x44, 0xFF, 0x0D, 0xB5, 0x3B, 0x27, 0x96, 0x06, 0x89, 0x7B, 0xA3, 0x0B, 0xD0, 0xCE, 0x9E, + 0x90, 0x22, 0x77, 0x5A, 0xAD, 0xA1, 0xE5, 0xB6, 0xFC, 0xCB, 0x39, 0x7E, 0x2B, 0x10, 0xEE, 0xB4, + 0x8C, 0x2B, 0xA4, 0x1F, 0x60, 0x76, 0x39, 0xD7, 0xF6, 0x46, 0x77, 0x18, 0x20, 0xAD, 0xD4, 0xC9, + 0x87, 0xF7, 0x37, 0xDA, 0xFD, 0xBA, 0xBA, 0xD2, 0xF2, 0x68, 0xDC, 0x26, 0x8D, 0x1B, 0x08, 0xC6 }; // CN-Lite IPBC const static uint8_t test_output_ipbc_lite[160] = { - 0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5, - 0x33, 0xE3, 0x9F, 0x37, 0xAC, 0xE5, 0xF8, 0xEB, 0x7A, 0xE8, 0x40, 0xEB, 0x5D, 0xB1, 0x35, 0x5F, - 0xB2, 0x47, 0x86, 0xF0, 0x7F, 0x6F, 0x4B, 0x55, 0x3E, 0xA1, 0xBB, 0xE8, 0xA1, 0x75, 0x00, 0x2D, - 0x07, 0x9A, 0x21, 0x0E, 0xBD, 0x06, 0x6A, 0xB0, 0xFD, 0x96, 0x9E, 0xE6, 0xE4, 0x69, 0x67, 0xBB, - 0x88, 0x45, 0x0B, 0x91, 0x0B, 0x7B, 0xCB, 0x21, 0x3C, 0x3C, 0x09, 0x30, 0x07, 0x71, 0x07, 0xD5, - 0xB8, 0x2D, 0x83, 0x09, 0xAF, 0x7E, 0xB2, 0xA8, 0xAC, 0x25, 0xDC, 0x10, 0xF8, 0x63, 0x6A, 0xBC, - 0x73, 0x01, 0x4E, 0xA8, 0x1C, 0xDA, 0x9A, 0x86, 0x17, 0xEC, 0xA8, 0xFB, 0xAA, 0x23, 0x23, 0x17, - 0xE1, 0x32, 0x68, 0x9C, 0x4C, 0xF4, 0x08, 0xED, 0xB0, 0x15, 0xC3, 0xA9, 0x0F, 0xF0, 0xA2, 0x7E, - 0xD9, 0xE4, 0x23, 0xA7, 0x9E, 0x91, 0xD8, 0x73, 0x94, 0xD6, 0x6C, 0x70, 0x9B, 0x8B, 0x72, 0x92, - 0xA3, 0xA4, 0x0A, 0xE2, 0x3C, 0x0A, 0x34, 0x88, 0xA1, 0x6D, 0xFE, 0x02, 0x44, 0x60, 0x7B, 0x3D + 0xE4, 0x93, 0x8C, 0xAA, 0x59, 0x8D, 0x02, 0x8A, 0xB8, 0x6F, 0x25, 0xD2, 0xB1, 0x23, 0xD0, 0xD5, + 0x33, 0xE3, 0x9F, 0x37, 0xAC, 0xE5, 0xF8, 0xEB, 0x7A, 0xE8, 0x40, 0xEB, 0x5D, 0xB1, 0x35, 0x5F, + 0xB2, 0x47, 0x86, 0xF0, 0x7F, 0x6F, 0x4B, 0x55, 0x3E, 0xA1, 0xBB, 0xE8, 0xA1, 0x75, 0x00, 0x2D, + 0x07, 0x9A, 0x21, 0x0E, 0xBD, 0x06, 0x6A, 0xB0, 0xFD, 0x96, 0x9E, 0xE6, 
0xE4, 0x69, 0x67, 0xBB, + 0x88, 0x45, 0x0B, 0x91, 0x0B, 0x7B, 0xCB, 0x21, 0x3C, 0x3C, 0x09, 0x30, 0x07, 0x71, 0x07, 0xD5, + 0xB8, 0x2D, 0x83, 0x09, 0xAF, 0x7E, 0xB2, 0xA8, 0xAC, 0x25, 0xDC, 0x10, 0xF8, 0x63, 0x6A, 0xBC, + 0x73, 0x01, 0x4E, 0xA8, 0x1C, 0xDA, 0x9A, 0x86, 0x17, 0xEC, 0xA8, 0xFB, 0xAA, 0x23, 0x23, 0x17, + 0xE1, 0x32, 0x68, 0x9C, 0x4C, 0xF4, 0x08, 0xED, 0xB0, 0x15, 0xC3, 0xA9, 0x0F, 0xF0, 0xA2, 0x7E, + 0xD9, 0xE4, 0x23, 0xA7, 0x9E, 0x91, 0xD8, 0x73, 0x94, 0xD6, 0x6C, 0x70, 0x9B, 0x8B, 0x72, 0x92, + 0xA3, 0xA4, 0x0A, 0xE2, 0x3C, 0x0A, 0x34, 0x88, 0xA1, 0x6D, 0xFE, 0x02, 0x44, 0x60, 0x7B, 0x3D }; // CN-Heavy const static uint8_t test_output_heavy[160] = { - 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, - 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, - 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, - 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, - 0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7, - 0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD, - 0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1, - 0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9, - 0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51, - 0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB + 0x99, 0x83, 0xF2, 0x1B, 0xDF, 0x20, 0x10, 0xA8, 0xD7, 0x07, 0xBB, 0x2F, 0x14, 0xD7, 0x86, 0x64, + 0xBB, 0xE1, 0x18, 0x7F, 0x55, 0x01, 0x4B, 0x39, 0xE5, 0xF3, 0xD6, 0x93, 0x28, 0xE4, 0x8F, 0xC2, + 0x4D, 0x94, 0x7D, 0xD6, 0xDB, 0x6E, 0x07, 0x48, 0x26, 0x4A, 0x51, 0x2E, 0xAC, 0xF3, 0x25, 0x4A, + 0x1F, 0x1A, 0xA2, 0x5B, 0xFC, 0x0A, 0xAD, 0x82, 0xDE, 0xA8, 0x99, 0x96, 0x88, 0x52, 0xD2, 0x7D, + 0x3E, 0xE1, 0x23, 0x03, 0x5A, 0x63, 0x7B, 0x66, 0xF6, 0xD7, 0xC2, 0x2A, 0x34, 0x5E, 0x88, 0xE7, + 0xFA, 0xC4, 0x25, 0x36, 0x54, 0xCB, 0xD2, 0x5C, 0x2F, 0x80, 0x2A, 0xF9, 0xCC, 0x43, 0xF7, 0xCD, + 0xE5, 0x18, 0xA8, 0x05, 0x60, 0x18, 0xA5, 0x73, 0x72, 0x9B, 0x32, 0xDC, 0x69, 0x83, 0xC1, 0xE1, + 0x1F, 0xDB, 0xDA, 0x6B, 0xAC, 0xEC, 0x9F, 0x67, 0xF8, 0x27, 0x1D, 0xC7, 0xE6, 0x46, 0x42, 0xF9, + 0x53, 0x62, 0x0A, 0x54, 0x7D, 0x43, 0xEA, 0x18, 0x94, 0xED, 0xD8, 0x92, 0x06, 0x6A, 0xA1, 0x51, + 0xAD, 0xB1, 0xFD, 0x89, 0xFB, 0x5C, 0xB4, 0x25, 0x6A, 0xDD, 0xB0, 0x09, 0xC5, 0x72, 0x87, 0xEB }; // CN-Heavy Haven const static uint8_t test_output_heavy_haven[96] = { - 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, - 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, - 0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F, - 0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA, - 0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9, - 0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B + 0x5A, 0xC3, 0xF7, 0x85, 0xC4, 0x90, 0xC5, 0x85, 0x50, 0xEC, 0x95, 0xD2, 0x72, 0x65, 0x63, 0x57, + 0x7E, 0x7C, 0x1C, 0x21, 0x2D, 0x0C, 0xDE, 0x59, 0x12, 0x73, 0x20, 0x1E, 0x44, 0xFD, 0xD5, 0xB6, + 0x1F, 0x4E, 0xB2, 0x0A, 0x36, 0x51, 0x4B, 0xF5, 0x4D, 0xC9, 
0xE0, 0x90, 0x2C, 0x16, 0x47, 0x3F, + 0xDE, 0x18, 0x29, 0x8E, 0xBB, 0x34, 0x2B, 0xEF, 0x7A, 0x04, 0x22, 0xD1, 0xB1, 0xF2, 0x48, 0xDA, + 0xE3, 0x7F, 0x4B, 0x4C, 0xB4, 0xDF, 0xE8, 0xD3, 0x70, 0xE2, 0xE7, 0x44, 0x25, 0x87, 0x12, 0xF9, + 0x8F, 0x28, 0x0B, 0xCE, 0x2C, 0xEE, 0xDD, 0x88, 0x94, 0x35, 0x48, 0x51, 0xAE, 0xC8, 0x9C, 0x0B }; // CN-Heavy Tube const static uint8_t test_output_heavy_tube[96] = { - 0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf, - 0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35, - 0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3, - 0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74, - 0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f, - 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb + 0xfe, 0x53, 0x35, 0x20, 0x76, 0xea, 0xe6, 0x89, 0xfa, 0x3b, 0x4f, 0xda, 0x61, 0x46, 0x34, 0xcf, + 0xc3, 0x12, 0xee, 0x0c, 0x38, 0x7d, 0xf2, 0xb8, 0xb7, 0x4d, 0xa2, 0xa1, 0x59, 0x74, 0x12, 0x35, + 0xcd, 0x3f, 0x29, 0xdf, 0x07, 0x4a, 0x14, 0xad, 0x0b, 0x98, 0x99, 0x37, 0xca, 0x14, 0x68, 0xa3, + 0x8d, 0xae, 0x86, 0xc1, 0xa3, 0x54, 0x05, 0xbe, 0xea, 0x6d, 0x29, 0x24, 0x0c, 0x82, 0x97, 0x74, + 0xa0, 0x64, 0x77, 0xcd, 0x8d, 0x8a, 0xc3, 0x10, 0xb4, 0x89, 0x0e, 0xbb, 0x7d, 0xe6, 0x32, 0x8f, + 0xf4, 0x2d, 0xb6, 0x9e, 0x8a, 0xf9, 0xf8, 0xee, 0x2c, 0xd0, 0x74, 0xed, 0xa9, 0xaa, 0xa1, 0xfb }; #endif /* __CRYPTONIGHT_TEST_H__ */ diff --git a/src/crypto/CryptoNight_x86.h b/src/crypto/CryptoNight_x86.h index 6ac2098d..06b5150c 100644 --- a/src/crypto/CryptoNight_x86.h +++ b/src/crypto/CryptoNight_x86.h @@ -29,6 +29,8 @@ #ifdef __GNUC__ # include +#include + #else # include # define __restrict__ __restrict @@ -37,7 +39,7 @@ #include "crypto/CryptoNight.h" #include "crypto/soft_aes.h" - +#include "AsmOptimization.h" extern "C" { @@ -46,42 +48,35 @@ extern "C" #include "crypto/c_blake256.h" #include "crypto/c_jh.h" #include "crypto/c_skein.h" + +#ifndef XMRIG_NO_ASM + void cnv1_mainloop_sandybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_ivybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_ryzen_asm(ScratchPad* ctx0); + void cnv2_double_mainloop_sandybridge_asm(ScratchPad* ctx0, ScratchPad* ctx1); + void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); + void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* ctx0); +#endif } -static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { - blake256_hash(output, input, len); -} - - -static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { - groestl(input, len * 8, output); -} - - -static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { - jh_hash(32 * 8, input, 8 * len, output); -} - - -static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { - xmr_skein(input, output); -} - - -void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; +#ifdef __GNUC__ +#define LIKELY(X) __builtin_expect(X, 1) +#define UNLIKELY(X) __builtin_expect(X, 0) +#else +#define LIKELY(X) X +#define UNLIKELY(X) X +#endif #if defined(__x86_64__) || defined(_M_AMD64) # define EXTRACT64(X) _mm_cvtsi128_si64(X) # ifdef __GNUC__ - static inline uint64_t __umul128(uint64_t a, uint64_t b, uint64_t* hi) { unsigned __int128 r = 
(unsigned __int128) a * (unsigned __int128) b; *hi = r >> 64; return (uint64_t) r; } - # else #define __umul128 _umul128 # endif @@ -120,6 +115,71 @@ static inline uint64_t __umul128(uint64_t multiplier, uint64_t multiplicand, uin } #endif +#ifdef _MSC_VER +#else +#endif + +#ifdef _MSC_VER +# define SET_ROUNDING_MODE_UP() _control87(RC_UP, MCW_RC); +#else +# define SET_ROUNDING_MODE_UP() std::fesetround(FE_UPWARD); +#endif + +# define SHUFFLE_PHASE_1(l, idx, bx0, bx1, ax) \ +{ \ + const __m128i chunk1 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ +} + +# define INTEGER_MATH_V2(idx, cl, cx) \ +{ \ + const uint64_t cx_ = _mm_cvtsi128_si64(cx); \ + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm##idx)) ^ (sqrt_result##idx << 32); \ + const uint32_t d_ = (cx_ + (sqrt_result##idx << 1)) | 0x80000001UL; \ + const uint64_t cx1_ = _mm_cvtsi128_si64(_mm_srli_si128(cx, 8)); \ + const uint64_t division_result = static_cast(cx1_ / d_) + ((cx1_ % d_) << 32); \ + division_result_xmm##idx = _mm_cvtsi64_si128(static_cast(division_result)); \ + sqrt_result##idx = int_sqrt_v2(cx_ + division_result); \ +} + +# define SHUFFLE_PHASE_2(l, idx, bx0, bx1, ax, lo, hi) \ +{ \ + const __m128i chunk1 = _mm_xor_si128(_mm_load_si128((__m128i *)((l) + ((idx) ^ 0x10))), _mm_set_epi64x(lo, hi)); \ + const __m128i chunk2 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x20))); \ + const __m128i chunk3 = _mm_load_si128((__m128i *)((l) + ((idx) ^ 0x30))); \ + hi ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[0]; \ + lo ^= ((uint64_t*)((l) + ((idx) ^ 0x20)))[1]; \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x10)), _mm_add_epi64(chunk3, bx1)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x20)), _mm_add_epi64(chunk1, bx0)); \ + _mm_store_si128((__m128i *)((l) + ((idx) ^ 0x30)), _mm_add_epi64(chunk2, ax)); \ +} + +static inline void do_blake_hash(const uint8_t *input, size_t len, uint8_t *output) { + blake256_hash(output, input, len); +} + + +static inline void do_groestl_hash(const uint8_t *input, size_t len, uint8_t *output) { + groestl(input, len * 8, output); +} + + +static inline void do_jh_hash(const uint8_t *input, size_t len, uint8_t *output) { + jh_hash(32 * 8, input, 8 * len, output); +} + + +static inline void do_skein_hash(const uint8_t *input, size_t len, uint8_t *output) { + xmr_skein(input, output); +} + + +void (* const extra_hashes[4])(const uint8_t *, size_t, uint8_t *) = {do_blake_hash, do_groestl_hash, do_jh_hash, do_skein_hash}; + // This will shift and xor tmp1 into itself as 4 32-bit vals such as // sl_xor(a1 a2 a3 a4) = a1 (a2^a1) (a3^a2^a1) (a4^a3^a2^a1) @@ -467,6 +527,37 @@ static inline void cn_implode_scratchpad_heavy(const __m128i* input, __m128i* ou _mm_store_si128(output + 11, xout7); } +static inline void int_sqrt_v2_fixup(uint64_t& r, uint64_t n0) +{ + if (LIKELY(r & 524287)) + { + r >>= 19; + return; + } + + --r; + const uint64_t s = r >> 20; + r >>= 19; + + uint64_t x2 = (s - (1022ULL << 32)) * (r - s - (1022ULL << 32) + 1); +#if (defined(_MSC_VER) || __GNUC__ > 7 || (__GNUC__ == 7 && __GNUC_MINOR__ > 1)) && (defined(__x86_64__) || defined(_M_AMD64)) + _addcarry_u64(_subborrow_u64(0, 
x2, n0, (unsigned long long int*)&x2), r, 0, (unsigned long long int*)&r); +#else + // GCC versions prior to 7 don't generate correct assembly for _subborrow_u64 -> _addcarry_u64 sequence + // Fallback to simpler code + if (x2 < n0) ++r; +#endif +} + +static inline uint64_t int_sqrt_v2(uint64_t n0) +{ +__m128d x = _mm_castsi128_pd(_mm_add_epi64(_mm_cvtsi64_si128(n0 >> 12), _mm_set_epi64x(0, 1023ULL << 52))); +x = _mm_sqrt_sd(_mm_setzero_pd(), x); +uint64_t r = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); +int_sqrt_v2_fixup(r, n0); +return r; +} + // n-Loop version. Seems to be little bit slower then the hardcoded one. template class CryptoNightMultiHash @@ -481,8 +572,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -502,22 +595,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -531,6 +629,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -551,9 +651,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -575,26 +677,33 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & 
MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -612,6 +721,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -623,6 +734,133 @@ public: } } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + + // multi + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + const uint8_t* l[NUM_HASH_BLOCKS]; + uint64_t* h[NUM_HASH_BLOCKS]; + uint64_t al[NUM_HASH_BLOCKS]; + uint64_t ah[NUM_HASH_BLOCKS]; + uint64_t idx[NUM_HASH_BLOCKS]; + uint64_t sqrt_result[NUM_HASH_BLOCKS]; + __m128i bx0[NUM_HASH_BLOCKS]; + __m128i bx1[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; + __m128i division_result_xmm[NUM_HASH_BLOCKS]; + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + l[hashBlock] = scratchPad[hashBlock]->memory; + h[hashBlock] = reinterpret_cast(scratchPad[hashBlock]->state); + + cn_explode_scratchpad((__m128i*) h[hashBlock], (__m128i*) l[hashBlock]); + + al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; + bx0[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx1[hashBlock] = _mm_set_epi64x(h[hashBlock][9] ^ h[hashBlock][11], h[hashBlock][8] ^ h[hashBlock][10]); + idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; + + division_result_xmm[hashBlock] = 
_mm_cvtsi64_si128(h[hashBlock][12]); + sqrt_result[hashBlock] = h[hashBlock][13]; + } + + SET_ROUNDING_MODE_UP(); + + uint64_t sqrt_result0; + __m128i division_result_xmm0; + + for (size_t i = 0; i < ITERATIONS; i++) { + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + + if (SOFT_AES) { + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); + } else { + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + SHUFFLE_PHASE_1(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock]) + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx0[hashBlock], cx[hashBlock])); + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cl = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[0]; + ch = ((uint64_t *) &l[hashBlock][idx[hashBlock] & MASK])[1]; + + sqrt_result0 = sqrt_result[hashBlock]; + division_result_xmm0 = division_result_xmm[hashBlock]; + + INTEGER_MATH_V2(0, cl, cx[hashBlock]) + + sqrt_result[hashBlock] = sqrt_result0; + division_result_xmm[hashBlock] = division_result_xmm0; + + lo = __umul128(idx[hashBlock], cl, &hi); + + SHUFFLE_PHASE_2(l[hashBlock], idx[hashBlock] & MASK, bx0[hashBlock], bx1[hashBlock], ax[hashBlock], lo, hi) + + al[hashBlock] += hi; + ah[hashBlock] += lo; + + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0] = al[hashBlock]; + ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1] = ah[hashBlock]; + + ah[hashBlock] ^= ch; + al[hashBlock] ^= cl; + idx[hashBlock] = al[hashBlock]; + + bx1[hashBlock] = bx0[hashBlock]; + bx0[hashBlock] = cx[hashBlock]; + } + } + + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + cn_implode_scratchpad((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); + keccakf(h[hashBlock], 24); + extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, + output + hashBlock * 32); + } + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -632,9 +870,11 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -650,34 +890,40 @@ public: al[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; ah[hashBlock] = h[hashBlock][1] ^ h[hashBlock][5]; - bx[hashBlock] = - _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], h[hashBlock][2] ^ h[hashBlock][6]); + bx[hashBlock] = _mm_set_epi64x(h[hashBlock][3] ^ h[hashBlock][7], 
h[hashBlock][2] ^ h[hashBlock][6]); idx[hashBlock] = h[hashBlock][0] ^ h[hashBlock][4]; } for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -697,6 +943,8 @@ public: ah[hashBlock] ^= ch; al[hashBlock] ^= cl; idx[hashBlock] = al[hashBlock]; + + bx[hashBlock] = cx[hashBlock]; } } @@ -717,8 +965,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -738,22 +988,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - 
_mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -774,6 +1029,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = d ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -781,7 +1038,7 @@ public: cn_implode_scratchpad_heavy((__m128i*) l[hashBlock], (__m128i*) h[hashBlock]); keccakf(h[hashBlock], 24); extra_hashes[scratchPad[hashBlock]->state[0] & 3](scratchPad[hashBlock]->state, 200, - output + hashBlock * 32); + output + hashBlock * 32); } } @@ -794,8 +1051,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; + __m128i ax[NUM_HASH_BLOCKS]; for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -815,22 +1074,27 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + ax[hashBlock] = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); if (SOFT_AES) { - cx = soft_aesenc((uint32_t*) &l[hashBlock][idx[hashBlock] & MASK], _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = soft_aesenc((uint32_t *) &l[hashBlock][idx[hashBlock] & MASK], ax[hashBlock]); } else { - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); - cx = _mm_aesenc_si128(cx, _mm_set_epi64x(ah[hashBlock], al[hashBlock])); + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + cx[hashBlock] = _mm_aesenc_si128(cx[hashBlock], ax[hashBlock]); } + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], - _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } - uint64_t hi, lo, cl, ch; + uint64_t hi, lo, cl, ch; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; lo = __umul128(idx[hashBlock], cl, &hi); @@ -851,6 +1115,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; idx[hashBlock] = (~d) ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -871,9 +1137,10 @@ public: uint64_t* h[NUM_HASH_BLOCKS]; uint64_t al[NUM_HASH_BLOCKS]; uint64_t ah[NUM_HASH_BLOCKS]; - __m128i bx[NUM_HASH_BLOCKS]; uint64_t idx[NUM_HASH_BLOCKS]; uint64_t tweak1_2[NUM_HASH_BLOCKS]; + __m128i bx[NUM_HASH_BLOCKS]; + __m128i cx[NUM_HASH_BLOCKS]; for (size_t hashBlock = 
0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { keccak(static_cast(input) + hashBlock * size, (int) size, scratchPad[hashBlock]->state, 200); @@ -903,36 +1170,48 @@ public: for (size_t i = 0; i < ITERATIONS; i++) { for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { - __m128i cx; + cx[hashBlock] = _mm_load_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK]); + } - cx = _mm_load_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK]); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const __m128i &key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); - const __m128i& key = _mm_set_epi64x(ah[hashBlock], al[hashBlock]); + _mm_store_si128((__m128i *) k, key); + cx[hashBlock] = _mm_xor_si128(cx[hashBlock], _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); + _mm_store_si128((__m128i *) x, cx[hashBlock]); - _mm_store_si128((__m128i*)k, key); - cx = _mm_xor_si128(cx, _mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())); - _mm_store_si128((__m128i*)x, cx); - - k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ saes_table[3][BYTE(x[3], 3)]; + k[0] ^= saes_table[0][BYTE(x[0], 0)] ^ saes_table[1][BYTE(x[1], 1)] ^ saes_table[2][BYTE(x[2], 2)] ^ + saes_table[3][BYTE(x[3], 3)]; x[0] ^= k[0]; - k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ saes_table[3][BYTE(x[0], 3)]; + k[1] ^= saes_table[0][BYTE(x[1], 0)] ^ saes_table[1][BYTE(x[2], 1)] ^ saes_table[2][BYTE(x[3], 2)] ^ + saes_table[3][BYTE(x[0], 3)]; x[1] ^= k[1]; - k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ saes_table[3][BYTE(x[1], 3)]; + k[2] ^= saes_table[0][BYTE(x[2], 0)] ^ saes_table[1][BYTE(x[3], 1)] ^ saes_table[2][BYTE(x[0], 2)] ^ + saes_table[3][BYTE(x[1], 3)]; x[2] ^= k[2]; - k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ saes_table[3][BYTE(x[2], 3)]; + k[3] ^= saes_table[0][BYTE(x[3], 0)] ^ saes_table[1][BYTE(x[0], 1)] ^ saes_table[2][BYTE(x[1], 2)] ^ + saes_table[3][BYTE(x[2], 3)]; - cx = _mm_load_si128((__m128i*)k); + cx[hashBlock] = _mm_load_si128((__m128i *) k); + } - _mm_store_si128((__m128i*) &l[hashBlock][idx[hashBlock] & MASK], _mm_xor_si128(bx[hashBlock], cx)); + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + _mm_store_si128((__m128i *) &l[hashBlock][idx[hashBlock] & MASK], + _mm_xor_si128(bx[hashBlock], cx[hashBlock])); + } - const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + const uint8_t tmp = reinterpret_cast(&l[hashBlock][idx[hashBlock] & MASK])[11]; static const uint32_t table = 0x75310; const uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + ((uint8_t *) (&l[hashBlock][idx[hashBlock] & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + } - idx[hashBlock] = EXTRACT64(cx); - bx[hashBlock] = cx; + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { + idx[hashBlock] = EXTRACT64(cx[hashBlock]); + } + for (size_t hashBlock = 0; hashBlock < NUM_HASH_BLOCKS; ++hashBlock) { uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[0]; ch = ((uint64_t*) &l[hashBlock][idx[hashBlock] & MASK])[1]; @@ -960,6 +1239,8 @@ public: ((int64_t*)&l[hashBlock][idx[hashBlock] & MASK])[0] = n ^ q; 
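The store just above and the index update that follows are the extra signed-division step that sets the heavy variants apart; some flavours derive the next index from d ^ q, others from (~d) ^ q (both forms appear in this hunk). A scalar sketch of that step is below; the way n and d are read from the 16-byte scratchpad line follows the usual cryptonight-heavy definition and is an assumption here, not something this hunk shows.

#include <cstdint>

// Sketch of the heavy-variant division step (layout of n and d is assumed, see note above).
inline uint64_t heavy_division_step(int64_t* line, bool invert_d)
{
    const int64_t n = line[0];                        // first 8 bytes of the line
    const int32_t d = static_cast<int32_t>(line[1]);  // bytes 8..11 of the line (little-endian)
    const int64_t q = n / (d | 0x5);                  // divisor forced non-zero, as in the loops above

    line[0] = n ^ q;                                  // the "= n ^ q" store above

    // Some flavours use d ^ q for the next index, others (~d) ^ q.
    return static_cast<uint64_t>((invert_d ? ~d : d) ^ q);
}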
idx[hashBlock] = d ^ q; + + bx[hashBlock] = cx[hashBlock]; } } @@ -1105,6 +1386,142 @@ public: extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + const uint8_t*l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (SOFT_AES) { + scratchPad[0]->input = input; + scratchPad[0]->variant1_table = variant1_table; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + cnv1_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + scratchPad[0]->input = input; + scratchPad[0]->variant1_table = variant1_table; + cnv1_mainloop_sandybridge_asm(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + // single + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + + const uint8_t*l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + + uint64_t al = h[0] ^ h[4]; + uint64_t ah = h[1] ^ h[5]; + __m128i bx0 = _mm_set_epi64x(h[3] ^ h[7], h[2] ^ h[6]); + __m128i bx1 = _mm_set_epi64x(h[9] ^ h[11], h[8] ^ h[10]); + + uint64_t idx = h[0] ^ h[4]; + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h[12]); + uint64_t sqrt_result0 = h[13]; + + SET_ROUNDING_MODE_UP(); + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx; + + const __m128i ax = _mm_set_epi64x(ah, al); + + if (SOFT_AES) { + cx = soft_aesenc((uint32_t*)&l[idx & MASK], ax); + } else { + cx = _mm_load_si128((__m128i*) &l[idx & MASK]); + cx = _mm_aesenc_si128(cx, ax); + } + + SHUFFLE_PHASE_1(l, (idx&MASK), bx0, bx1, ax) + + _mm_store_si128((__m128i*) &l[idx & MASK], _mm_xor_si128(bx0, cx)); + + idx = EXTRACT64(cx); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l[idx & MASK])[0]; + ch = ((uint64_t*) &l[idx & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx) + + lo = __umul128(idx, cl, &hi); + + SHUFFLE_PHASE_2(l, (idx&MASK), bx0, bx1, ax, lo, hi) + + al += hi; // two fence statements are overhead + ah += lo; + + ((uint64_t*) &l[idx & MASK])[0] = al; + ((uint64_t*) &l[idx & MASK])[1] = ah; + + ah ^= ch; + al ^= cl; + idx = al; + + bx1 = bx0; + bx0 = cx; + } + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + + + // single asm + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + const uint8_t* l = scratchPad[0]->memory; + uint64_t* h = reinterpret_cast(scratchPad[0]->state); + + keccak(static_cast(input), (int) size, scratchPad[0]->state, 200); + cn_explode_scratchpad((__m128i*) h, (__m128i*) l); + +#ifndef XMRIG_NO_ASM + if (asmOptimization == AsmOptimization::ASM_INTEL) { + if (SOFT_AES) { + scratchPad[0]->input = input; + scratchPad[0]->t_fn = (const uint32_t*)saes_table; + 
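The *_asm wrappers in this hunk keep keccak, the scratchpad explode/implode and the final extra hash in C++ and hand the prepared ScratchPad to an external assembly main loop (the cn_main_loop.* sources added to the build). Their prototypes are not part of this diff; below is a minimal sketch of what the C++ side presumably declares, with ScratchPad reduced to the members these wrappers actually touch (the real struct and signatures may differ).

#include <cstdint>

// Hypothetical declarations; the real ones live elsewhere in the tree.
struct ScratchPad {
    uint8_t*        memory;          // scratchpad memory, exploded from state
    uint8_t         state[200];      // keccak state, input/output of the main loop
    const uint8_t*  input;           // set by the wrappers for the variant-1 loops
    const uint32_t* t_fn;            // soft-AES table, set for the *_soft_aes loops
    const void*     variant1_table;  // variant-1 tweak table, set for the cnv1 loops
};

extern "C" {
    void cnv1_mainloop_sandybridge_asm(ScratchPad* scratchPad);
    void cnv1_mainloop_soft_aes_sandybridge_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_ivybridge_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_ryzen_asm(ScratchPad* scratchPad);
    void cnv2_mainloop_soft_aes_sandybridge_asm(ScratchPad* scratchPad);
    void cnv2_double_mainloop_sandybridge_asm(ScratchPad* scratchPad0, ScratchPad* scratchPad1);
}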
cnv2_mainloop_soft_aes_sandybridge_asm(scratchPad[0]); + } else { + cnv2_mainloop_ivybridge_asm(scratchPad[0]); + } + } else if (asmOptimization == AsmOptimization::ASM_RYZEN) { + cnv2_mainloop_ryzen_asm(scratchPad[0]); + } +#endif + + cn_implode_scratchpad((__m128i*) l, (__m128i*) h); + keccakf(h, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -1605,6 +2022,209 @@ public: extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); } + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // double + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + + __m128i division_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[12]), _mm_cvtsi64_si128(h1[12])); + __m128i sqrt_result_xmm = _mm_unpacklo_epi64(_mm_cvtsi64_si128(h0[13]), _mm_cvtsi64_si128(h1[13])); + + SET_ROUNDING_MODE_UP() + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + } + + SHUFFLE_PHASE_1(l0, (idx0 & MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1 & MASK), bx01, bx11, ax1) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + const uint64_t sqrt_result0 = _mm_cvtsi128_si64(sqrt_result_xmm); + cl ^= static_cast(_mm_cvtsi128_si64(division_result_xmm)) ^ (sqrt_result0 << 32); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0 & MASK), bx00, bx10, ax0, lo, hi) + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + cl = 
((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + const uint64_t sqrt_result1 = _mm_cvtsi128_si64(_mm_srli_si128(sqrt_result_xmm, 8)); + cl ^= static_cast(_mm_cvtsi128_si64(_mm_srli_si128(division_result_xmm, 8))) ^ (sqrt_result1 << 32); + + const __m128i sqrt_result2 = _mm_add_epi64(_mm_slli_epi64(sqrt_result_xmm, 1), _mm_unpacklo_epi64(cx0, cx1)); + const uint32_t d0 = _mm_cvtsi128_si64(sqrt_result2) | 0x80000001UL; + const uint32_t d1 = _mm_cvtsi128_si64(_mm_srli_si128(sqrt_result2, 8)) | 0x80000001UL; + + const uint64_t cx01 = _mm_cvtsi128_si64(_mm_srli_si128(cx0, 8)); + const uint64_t cx11 = _mm_cvtsi128_si64(_mm_srli_si128(cx1, 8)); + __m128d x = _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), (cx01 + 1) >> 1), _mm_cvtsi64_sd(_mm_setzero_pd(), (cx11 + 1) >> 1)); + __m128d y = _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), d0), _mm_cvtsi64_sd(_mm_setzero_pd(), d1)); + + __m128d result = _mm_div_pd(x, y); + result = _mm_castsi128_pd(_mm_add_epi64(_mm_castpd_si128(result), _mm_set_epi64x(1ULL << 52, 1ULL << 52))); + + uint64_t q0 = _mm_cvttsd_si64(result); + uint64_t q1 = _mm_cvttsd_si64(_mm_castsi128_pd(_mm_srli_si128(_mm_castpd_si128(result), 8))); + + uint64_t r0 = cx01 - d0 * q0; + if (UNLIKELY(int64_t(r0) < 0)) + { + --q0; + r0 += d0; + } + uint64_t r1 = cx11 - d1 * q1; + if (UNLIKELY(int64_t(r1) < 0)) + { + --q1; + r1 += d1; + } + + division_result_xmm = _mm_set_epi32(r1, q1, r0, q0); + + __m128i sqrt_input = _mm_add_epi64(_mm_unpacklo_epi64(cx0, cx1), division_result_xmm); + x = _mm_castsi128_pd(_mm_add_epi64(_mm_srli_epi64(sqrt_input, 12), _mm_set_epi64x(1023ULL << 52, 1023ULL << 52))); + + x = _mm_sqrt_pd(x); + + r0 = static_cast(_mm_cvtsi128_si64(_mm_castpd_si128(x))); + int_sqrt_v2_fixup(r0, _mm_cvtsi128_si64(sqrt_input)); + r1 = static_cast(_mm_cvtsi128_si64(_mm_srli_si128(_mm_castpd_si128(x), 8))); + int_sqrt_v2_fixup(r1, _mm_cvtsi128_si64(_mm_srli_si128(sqrt_input, 8))); + sqrt_result_xmm = _mm_set_epi64x(r1, r0); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1 & MASK), bx01, bx11, ax1, lo, hi) + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + + // double asm + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + +#ifndef XMRIG_NO_ASM + cnv2_double_mainloop_sandybridge_asm(scratchPad[0], scratchPad[1]); +#endif + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + + keccakf(h0, 24); + keccakf(h1, 24); + 
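The packed-double division and square root in the double hashPowV3 above, together with the r = n - d*q correction and int_sqrt_v2_fixup, compute per lane what INTEGER_MATH_V2 presumably expands to in the scalar paths. A scalar restatement of that step follows; names mirror the diff, and the square root is left as the uncorrected double-precision estimate that the real code then fixes up.

#include <cstdint>
#include <cmath>

struct V2MathState {
    uint64_t division_result = 0;   // low lane of division_result_xmm
    uint64_t sqrt_result     = 0;
};

// cx_lo / cx_hi are the low and high 64-bit halves of the AES output cx.
// Returns the adjusted cl that feeds the 64x64 multiply (__umul128) above.
inline uint64_t integer_math_v2(uint64_t cl, uint64_t cx_lo, uint64_t cx_hi, V2MathState& s)
{
    // Mix the previous iteration's results into cl.
    cl ^= s.division_result ^ (s.sqrt_result << 32);

    // Divisor: low half of cx plus twice the previous sqrt, forced odd and >= 2^31
    // (matches d0/d1 = ... | 0x80000001UL above).
    const uint32_t d = static_cast<uint32_t>((cx_lo + (s.sqrt_result << 1)) | 0x80000001UL);

    // Divide the high half of cx; quotient in the low 32 bits, remainder above it
    // (matches division_result_xmm = _mm_set_epi32(r1, q1, r0, q0) above).
    const uint64_t q = cx_hi / d;
    const uint64_t r = cx_hi % d;
    s.division_result = (r << 32) | static_cast<uint32_t>(q);

    // sqrt_result ~= floor(sqrt(2^64 + sqrt_input) * 2 - 2^33); the real code
    // refines this double-precision estimate with int_sqrt_v2_fixup.
    const uint64_t sqrt_input = cx_lo + s.division_result;
    s.sqrt_result = static_cast<uint64_t>(
        std::sqrt(18446744073709551616.0 + static_cast<double>(sqrt_input)) * 2.0 - 8589934592.0);

    return cl;
}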
+ extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + } + inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, @@ -2216,155 +2836,341 @@ public: extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i 
bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - } + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + } - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) 
&l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); - al0 += hi; - ah0 += lo; + al0 += hi; + ah0 += lo; - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); - al1 += hi; - ah1 += lo; + al1 += hi; + ah1 += lo; - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); - al2 += hi; - ah2 += lo; + al2 += hi; + ah2 += lo; - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - } + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + } - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - } + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // triple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; 
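For reference, every hashPowV2 lane above repeats the same two variant-1 tweaks: the table transform of byte 11 of the freshly stored line, and a tweak1_2 xor that only affects the value written back for ah (the xor/store/xor pair leaves the register unchanged). Pulled out into stand-alone helpers below; INDEX_SHIFT is a compile-time constant in the real code and is passed explicitly here.

#include <cstdint>
#include <cstring>

// tweak1_2 = (input bytes 35..42) ^ (keccak state word 24), as derived above.
inline uint64_t variant1_tweak(const uint8_t* input, const uint64_t* state)
{
    uint64_t t;
    std::memcpy(&t, input + 35, sizeof(t));
    return t ^ state[24];
}

// Table transform of byte 11 of the 16-byte line that was just stored.
inline void variant1_byte11_tweak(uint8_t* line, unsigned index_shift)
{
    static const uint32_t table = 0x75310;
    const uint8_t tmp   = line[11];
    const uint8_t index = (((tmp >> index_shift) & 6) | (tmp & 1)) << 1;
    line[11] = tmp ^ ((table >> index) & 0x30);
}

// Net effect of "ah ^= tweak; store ah; ah ^= tweak" in the loops above.
inline void variant1_store(uint64_t* line, uint64_t al, uint64_t ah, uint64_t tweak1_2)
{
    line[0] = al;
    line[1] = ah ^ tweak1_2;
}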
+ const uint8_t* l2 = scratchPad[2]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + 
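INTEGER_MATH_V2 takes the lane number as its first argument; the most plausible reading is that the macro pastes that suffix onto the per-lane locals (division_result_xmm0/1/2, sqrt_result0/1/2), which would also explain why the NUM_HASH_BLOCKS version near the top of this hunk copies its array slots into the ..._0 temporaries before invoking the macro. A toy illustration of the pattern (names hypothetical):

#include <cstdint>
#include <cstdio>

// Selects per-lane state by token-pasting the first argument, the way
// INTEGER_MATH_V2(0, ...) presumably resolves to division_result_xmm0 / sqrt_result0.
#define ACCUMULATE(part, value) \
    do { sum_##part += (value); } while (0)

int main()
{
    uint64_t sum_0 = 0, sum_1 = 0;

    ACCUMULATE(0, 10);   // expands to: sum_0 += 10;
    ACCUMULATE(1, 32);   // expands to: sum_1 += 32;

    std::printf("%llu %llu\n",
                static_cast<unsigned long long>(sum_0),
                static_cast<unsigned long long>(sum_1));
    return 0;
}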
SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi) + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, @@ -3198,197 +4004,7 @@ public: extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); } - inline static void hashPowV2(const uint8_t* __restrict__ input, - size_t size, - uint8_t* __restrict__ output, - ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); - - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(scratchPad[3]->state) + 24)); - - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - const uint8_t* l3 = scratchPad[3]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); - - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - - if (SOFT_AES) { - 
cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; - - - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); - - al1 += hi; - ah1 += lo; - - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; - - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; - - - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); - - al2 += hi; - ah2 += lo; - - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; - - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; - - - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); - - al3 += hi; - ah3 += lo; - - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; - - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; - } - - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - - keccakf(h0, 24); - keccakf(h1, 24); - 
keccakf(h2, 24); - keccakf(h3, 24); - - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); - } - - inline static void hashLiteTube(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) @@ -3493,6 +4109,429 @@ public: bx3 = cx3; + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // quadruple + inline static void hashPowV3(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + 
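Like the other hashPowV3 bodies in this hunk, the quadruple version below calls SET_ROUNDING_MODE_UP() once before its main loop; since the variant-2 division and square root run on SSE doubles, the macro presumably pins the MXCSR rounding mode (to round-up, per its name) so the results stay consistent across the loop. Its definition is not part of this diff; one plausible sketch:

#include <xmmintrin.h>

// Plausible definition only; the real macro lives elsewhere in the tree and may
// also cover the x87/MSVC control word. This flips the SSE (MXCSR) rounding mode,
// which is what the _mm_div_pd / _mm_sqrt_pd math in these loops uses.
#ifndef SET_ROUNDING_MODE_UP
#define SET_ROUNDING_MODE_UP() _MM_SET_ROUNDING_MODE(_MM_ROUND_UP)
#endif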
uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + __m128i division_result_xmm3 = _mm_cvtsi64_si128(h3[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) 
&l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + bx10 = bx00; + bx00 = cx0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + + INTEGER_MATH_V2(1, cl, cx1); + + lo = __umul128(idx1, cl, &hi); + + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); + + al1 += hi; + ah1 += lo; + + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + bx11 = bx01; + bx01 = cx1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + + INTEGER_MATH_V2(2, cl, cx2); + + lo = __umul128(idx2, cl, &hi); + + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + + INTEGER_MATH_V2(3, cl, cx3); + + lo = __umul128(idx3, cl, &hi); + + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + } + + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + inline static void hashLiteTube(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = 
reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + + uint64_t hi, lo, cl, ch; cl = ((uint64_t*) &l0[idx0 & MASK])[0]; ch = ((uint64_t*) &l0[idx0 & MASK])[1]; @@ -3806,235 +4845,512 @@ public: extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); } - inline static void hashPowV2(const uint8_t* __restrict__ input, + inline static void hashPowV2(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad) + { + keccak((const uint8_t*) input, (int) size, 
scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); + + uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ + *(reinterpret_cast(scratchPad[0]->state) + 24)); + uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ + *(reinterpret_cast(scratchPad[1]->state) + 24)); + uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ + *(reinterpret_cast(scratchPad[2]->state) + 24)); + uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ + *(reinterpret_cast(scratchPad[3]->state) + 24)); + uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ + *(reinterpret_cast(scratchPad[4]->state) + 24)); + + + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); + __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); + cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); + cx2 = _mm_aesenc_si128(cx2, 
_mm_set_epi64x(ah2, al2)); + cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); + cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); + } + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); + + static const uint32_t table = 0x75310; + uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; + uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; + index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; + ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + bx0 = cx0; + bx1 = cx1; + bx2 = cx2; + bx3 = cx3; + bx4 = cx4; + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + lo = __umul128(idx0, cl, &hi); + + al0 += hi; + ah0 += lo; + + ah0 ^= tweak1_2_0; + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + ah0 ^= tweak1_2_0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; + lo = __umul128(idx1, cl, &hi); + + al1 += hi; + ah1 += lo; + + ah1 ^= tweak1_2_1; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; + ah1 ^= tweak1_2_1; + + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; + + + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; + lo = __umul128(idx2, cl, &hi); + + al2 += hi; + ah2 += lo; + + ah2 ^= tweak1_2_2; + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + ah2 ^= tweak1_2_2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; + lo = __umul128(idx3, cl, &hi); + + al3 += hi; + ah3 += lo; + + ah3 ^= tweak1_2_3; + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + ah3 ^= tweak1_2_3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; + lo = __umul128(idx4, cl, &hi); + + al4 += hi; + ah4 += lo; + + ah4 ^= tweak1_2_4; + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; + ah4 ^= tweak1_2_4; + + ah4 ^= ch; + al4 ^= cl; + idx4 = al4; + } + + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, 
(__m128i*) h4); + + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); + + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } + + inline static void hashPowV2_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } + + // quintuple + inline static void hashPowV3(const uint8_t* __restrict__ input, size_t size, uint8_t* __restrict__ output, ScratchPad** __restrict__ scratchPad) - { - keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); - keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); - keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); - keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); - keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); + { + keccak((const uint8_t*) input, (int) size, scratchPad[0]->state, 200); + keccak((const uint8_t*) input + size, (int) size, scratchPad[1]->state, 200); + keccak((const uint8_t*) input + 2 * size, (int) size, scratchPad[2]->state, 200); + keccak((const uint8_t*) input + 3 * size, (int) size, scratchPad[3]->state, 200); + keccak((const uint8_t*) input + 4 * size, (int) size, scratchPad[4]->state, 200); - uint64_t tweak1_2_0 = (*reinterpret_cast(reinterpret_cast(input) + 35) ^ - *(reinterpret_cast(scratchPad[0]->state) + 24)); - uint64_t tweak1_2_1 = (*reinterpret_cast(reinterpret_cast(input) + 35 + size) ^ - *(reinterpret_cast(scratchPad[1]->state) + 24)); - uint64_t tweak1_2_2 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 2 * size) ^ - *(reinterpret_cast(scratchPad[2]->state) + 24)); - uint64_t tweak1_2_3 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 3 * size) ^ - *(reinterpret_cast(scratchPad[3]->state) + 24)); - uint64_t tweak1_2_4 = (*reinterpret_cast(reinterpret_cast(input) + 35 + 4 * size) ^ - *(reinterpret_cast(scratchPad[4]->state) + 24)); + const uint8_t* l0 = scratchPad[0]->memory; + const uint8_t* l1 = scratchPad[1]->memory; + const uint8_t* l2 = scratchPad[2]->memory; + const uint8_t* l3 = scratchPad[3]->memory; + const uint8_t* l4 = scratchPad[4]->memory; + uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); + uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); + uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); + uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); + uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + + cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); + cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); + cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); + cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); + cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + + uint64_t al0 = h0[0] ^h0[4]; + uint64_t al1 = h1[0] ^h1[4]; + uint64_t al2 = h2[0] ^h2[4]; + uint64_t al3 = h3[0] ^h3[4]; + uint64_t al4 = h4[0] ^h4[4]; + + uint64_t ah0 = h0[1] ^h0[5]; + uint64_t ah1 = h1[1] ^h1[5]; + uint64_t ah2 = h2[1] ^h2[5]; + uint64_t ah3 = h3[1] ^h3[5]; + uint64_t ah4 = h4[1] ^h4[5]; + + __m128i bx00 = _mm_set_epi64x(h0[3] 
^ h0[7], h0[2] ^ h0[6]); + __m128i bx01 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); + __m128i bx02 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); + __m128i bx03 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); + __m128i bx04 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + + __m128i bx10 = _mm_set_epi64x(h0[9] ^ h0[11], h0[8] ^ h0[10]); + __m128i bx11 = _mm_set_epi64x(h1[9] ^ h1[11], h1[8] ^ h1[10]); + __m128i bx12 = _mm_set_epi64x(h2[9] ^ h2[11], h2[8] ^ h2[10]); + __m128i bx13 = _mm_set_epi64x(h3[9] ^ h3[11], h3[8] ^ h3[10]); + __m128i bx14 = _mm_set_epi64x(h4[9] ^ h4[11], h4[8] ^ h4[10]); + + uint64_t idx0 = h0[0] ^h0[4]; + uint64_t idx1 = h1[0] ^h1[4]; + uint64_t idx2 = h2[0] ^h2[4]; + uint64_t idx3 = h3[0] ^h3[4]; + uint64_t idx4 = h4[0] ^h4[4]; + + SET_ROUNDING_MODE_UP(); + + __m128i division_result_xmm0 = _mm_cvtsi64_si128(h0[12]); + __m128i division_result_xmm1 = _mm_cvtsi64_si128(h1[12]); + __m128i division_result_xmm2 = _mm_cvtsi64_si128(h2[12]); + __m128i division_result_xmm3 = _mm_cvtsi64_si128(h3[12]); + __m128i division_result_xmm4 = _mm_cvtsi64_si128(h4[12]); + + uint64_t sqrt_result0 = h0[13]; + uint64_t sqrt_result1 = h1[13]; + uint64_t sqrt_result2 = h2[13]; + uint64_t sqrt_result3 = h3[13]; + uint64_t sqrt_result4 = h4[13]; + + for (size_t i = 0; i < ITERATIONS; i++) { + __m128i cx0; + __m128i cx1; + __m128i cx2; + __m128i cx3; + __m128i cx4; + + const __m128i ax0 = _mm_set_epi64x(ah0, al0); + const __m128i ax1 = _mm_set_epi64x(ah1, al1); + const __m128i ax2 = _mm_set_epi64x(ah2, al2); + const __m128i ax3 = _mm_set_epi64x(ah3, al3); + const __m128i ax4 = _mm_set_epi64x(ah4, al4); + + if (SOFT_AES) { + cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], ax0); + cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], ax1); + cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], ax2); + cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], ax3); + cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], ax4); + } else { + cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); + cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); + cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); + cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); + cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + + cx0 = _mm_aesenc_si128(cx0, ax0); + cx1 = _mm_aesenc_si128(cx1, ax1); + cx2 = _mm_aesenc_si128(cx2, ax2); + cx3 = _mm_aesenc_si128(cx3, ax3); + cx4 = _mm_aesenc_si128(cx4, ax4); + } + + SHUFFLE_PHASE_1(l0, (idx0&MASK), bx00, bx10, ax0) + SHUFFLE_PHASE_1(l1, (idx1&MASK), bx01, bx11, ax1) + SHUFFLE_PHASE_1(l2, (idx2&MASK), bx02, bx12, ax2) + SHUFFLE_PHASE_1(l3, (idx3&MASK), bx03, bx13, ax3) + SHUFFLE_PHASE_1(l4, (idx4&MASK), bx04, bx14, ax4) + + _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx00, cx0)); + _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx01, cx1)); + _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx02, cx2)); + _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx03, cx3)); + _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx04, cx4)); + + idx0 = EXTRACT64(cx0); + idx1 = EXTRACT64(cx1); + idx2 = EXTRACT64(cx2); + idx3 = EXTRACT64(cx3); + idx4 = EXTRACT64(cx4); + + uint64_t hi, lo, cl, ch; + cl = ((uint64_t*) &l0[idx0 & MASK])[0]; + ch = ((uint64_t*) &l0[idx0 & MASK])[1]; + + INTEGER_MATH_V2(0, cl, cx0); + + lo = __umul128(idx0, cl, &hi); + + SHUFFLE_PHASE_2(l0, (idx0&MASK), bx00, bx10, ax0, lo, hi); + + al0 += hi; + ah0 += lo; + + ((uint64_t*) &l0[idx0 & MASK])[0] = al0; + ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; + + ah0 ^= ch; + al0 ^= cl; + idx0 = al0; + + 
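+ // lane 0 (CN variant 2): rotate the b registers, the previous bx00 moves into bx10 and the freshly encrypted cx0 becomes the new bx00, both are consumed by SHUFFLE_PHASE_1/2 on the next iteration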
bx10 = bx00; + bx00 = cx0; - const uint8_t* l0 = scratchPad[0]->memory; - const uint8_t* l1 = scratchPad[1]->memory; - const uint8_t* l2 = scratchPad[2]->memory; - const uint8_t* l3 = scratchPad[3]->memory; - const uint8_t* l4 = scratchPad[4]->memory; - uint64_t* h0 = reinterpret_cast(scratchPad[0]->state); - uint64_t* h1 = reinterpret_cast(scratchPad[1]->state); - uint64_t* h2 = reinterpret_cast(scratchPad[2]->state); - uint64_t* h3 = reinterpret_cast(scratchPad[3]->state); - uint64_t* h4 = reinterpret_cast(scratchPad[4]->state); + cl = ((uint64_t*) &l1[idx1 & MASK])[0]; + ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - cn_explode_scratchpad((__m128i*) h0, (__m128i*) l0); - cn_explode_scratchpad((__m128i*) h1, (__m128i*) l1); - cn_explode_scratchpad((__m128i*) h2, (__m128i*) l2); - cn_explode_scratchpad((__m128i*) h3, (__m128i*) l3); - cn_explode_scratchpad((__m128i*) h4, (__m128i*) l4); + INTEGER_MATH_V2(1, cl, cx1); - uint64_t al0 = h0[0] ^h0[4]; - uint64_t al1 = h1[0] ^h1[4]; - uint64_t al2 = h2[0] ^h2[4]; - uint64_t al3 = h3[0] ^h3[4]; - uint64_t al4 = h4[0] ^h4[4]; - uint64_t ah0 = h0[1] ^h0[5]; - uint64_t ah1 = h1[1] ^h1[5]; - uint64_t ah2 = h2[1] ^h2[5]; - uint64_t ah3 = h3[1] ^h3[5]; - uint64_t ah4 = h4[1] ^h4[5]; + lo = __umul128(idx1, cl, &hi); - __m128i bx0 = _mm_set_epi64x(h0[3] ^ h0[7], h0[2] ^ h0[6]); - __m128i bx1 = _mm_set_epi64x(h1[3] ^ h1[7], h1[2] ^ h1[6]); - __m128i bx2 = _mm_set_epi64x(h2[3] ^ h2[7], h2[2] ^ h2[6]); - __m128i bx3 = _mm_set_epi64x(h3[3] ^ h3[7], h3[2] ^ h3[6]); - __m128i bx4 = _mm_set_epi64x(h4[3] ^ h4[7], h4[2] ^ h4[6]); + SHUFFLE_PHASE_2(l1, (idx1&MASK), bx01, bx11, ax1, lo, hi); - uint64_t idx0 = h0[0] ^h0[4]; - uint64_t idx1 = h1[0] ^h1[4]; - uint64_t idx2 = h2[0] ^h2[4]; - uint64_t idx3 = h3[0] ^h3[4]; - uint64_t idx4 = h4[0] ^h4[4]; + al1 += hi; + ah1 += lo; - for (size_t i = 0; i < ITERATIONS; i++) { - __m128i cx0; - __m128i cx1; - __m128i cx2; - __m128i cx3; - __m128i cx4; + ((uint64_t*) &l1[idx1 & MASK])[0] = al1; + ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - if (SOFT_AES) { - cx0 = soft_aesenc((uint32_t*)&l0[idx0 & MASK], _mm_set_epi64x(ah0, al0)); - cx1 = soft_aesenc((uint32_t*)&l1[idx1 & MASK], _mm_set_epi64x(ah1, al1)); - cx2 = soft_aesenc((uint32_t*)&l2[idx2 & MASK], _mm_set_epi64x(ah2, al2)); - cx3 = soft_aesenc((uint32_t*)&l3[idx3 & MASK], _mm_set_epi64x(ah3, al3)); - cx4 = soft_aesenc((uint32_t*)&l4[idx4 & MASK], _mm_set_epi64x(ah4, al4)); - } else { - cx0 = _mm_load_si128((__m128i*) &l0[idx0 & MASK]); - cx1 = _mm_load_si128((__m128i*) &l1[idx1 & MASK]); - cx2 = _mm_load_si128((__m128i*) &l2[idx2 & MASK]); - cx3 = _mm_load_si128((__m128i*) &l3[idx3 & MASK]); - cx4 = _mm_load_si128((__m128i*) &l4[idx4 & MASK]); + ah1 ^= ch; + al1 ^= cl; + idx1 = al1; - cx0 = _mm_aesenc_si128(cx0, _mm_set_epi64x(ah0, al0)); - cx1 = _mm_aesenc_si128(cx1, _mm_set_epi64x(ah1, al1)); - cx2 = _mm_aesenc_si128(cx2, _mm_set_epi64x(ah2, al2)); - cx3 = _mm_aesenc_si128(cx3, _mm_set_epi64x(ah3, al3)); - cx4 = _mm_aesenc_si128(cx4, _mm_set_epi64x(ah4, al4)); - } - - _mm_store_si128((__m128i*) &l0[idx0 & MASK], _mm_xor_si128(bx0, cx0)); - _mm_store_si128((__m128i*) &l1[idx1 & MASK], _mm_xor_si128(bx1, cx1)); - _mm_store_si128((__m128i*) &l2[idx2 & MASK], _mm_xor_si128(bx2, cx2)); - _mm_store_si128((__m128i*) &l3[idx3 & MASK], _mm_xor_si128(bx3, cx3)); - _mm_store_si128((__m128i*) &l4[idx4 & MASK], _mm_xor_si128(bx4, cx4)); - - static const uint32_t table = 0x75310; - uint8_t tmp = reinterpret_cast(&l0[idx0 & MASK])[11]; - uint8_t index = (((tmp >> INDEX_SHIFT) & 6) | (tmp 
& 1)) << 1; - ((uint8_t*)(&l0[idx0 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l1[idx1 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l1[idx1 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l2[idx2 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l2[idx2 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l3[idx3 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l3[idx3 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - tmp = reinterpret_cast(&l4[idx4 & MASK])[11]; - index = (((tmp >> INDEX_SHIFT) & 6) | (tmp & 1)) << 1; - ((uint8_t*)(&l4[idx4 & MASK]))[11] = tmp ^ ((table >> index) & 0x30); - - idx0 = EXTRACT64(cx0); - idx1 = EXTRACT64(cx1); - idx2 = EXTRACT64(cx2); - idx3 = EXTRACT64(cx3); - idx4 = EXTRACT64(cx4); - - bx0 = cx0; - bx1 = cx1; - bx2 = cx2; - bx3 = cx3; - bx4 = cx4; - - uint64_t hi, lo, cl, ch; - cl = ((uint64_t*) &l0[idx0 & MASK])[0]; - ch = ((uint64_t*) &l0[idx0 & MASK])[1]; - lo = __umul128(idx0, cl, &hi); - - al0 += hi; - ah0 += lo; - - ah0 ^= tweak1_2_0; - ((uint64_t*) &l0[idx0 & MASK])[0] = al0; - ((uint64_t*) &l0[idx0 & MASK])[1] = ah0; - ah0 ^= tweak1_2_0; - - ah0 ^= ch; - al0 ^= cl; - idx0 = al0; + bx11 = bx01; + bx01 = cx1; - cl = ((uint64_t*) &l1[idx1 & MASK])[0]; - ch = ((uint64_t*) &l1[idx1 & MASK])[1]; - lo = __umul128(idx1, cl, &hi); + cl = ((uint64_t*) &l2[idx2 & MASK])[0]; + ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - al1 += hi; - ah1 += lo; + INTEGER_MATH_V2(2, cl, cx2); - ah1 ^= tweak1_2_1; - ((uint64_t*) &l1[idx1 & MASK])[0] = al1; - ((uint64_t*) &l1[idx1 & MASK])[1] = ah1; - ah1 ^= tweak1_2_1; + lo = __umul128(idx2, cl, &hi); - ah1 ^= ch; - al1 ^= cl; - idx1 = al1; + SHUFFLE_PHASE_2(l2, (idx2&MASK), bx02, bx12, ax2, lo, hi); + + al2 += hi; + ah2 += lo; + + ((uint64_t*) &l2[idx2 & MASK])[0] = al2; + ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; + + ah2 ^= ch; + al2 ^= cl; + idx2 = al2; + + bx12 = bx02; + bx02 = cx2; - cl = ((uint64_t*) &l2[idx2 & MASK])[0]; - ch = ((uint64_t*) &l2[idx2 & MASK])[1]; - lo = __umul128(idx2, cl, &hi); + cl = ((uint64_t*) &l3[idx3 & MASK])[0]; + ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - al2 += hi; - ah2 += lo; + INTEGER_MATH_V2(3, cl, cx3); - ah2 ^= tweak1_2_2; - ((uint64_t*) &l2[idx2 & MASK])[0] = al2; - ((uint64_t*) &l2[idx2 & MASK])[1] = ah2; - ah2 ^= tweak1_2_2; + lo = __umul128(idx3, cl, &hi); - ah2 ^= ch; - al2 ^= cl; - idx2 = al2; + SHUFFLE_PHASE_2(l3, (idx3&MASK), bx03, bx13, ax3, lo, hi); + + al3 += hi; + ah3 += lo; + + ((uint64_t*) &l3[idx3 & MASK])[0] = al3; + ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; + + ah3 ^= ch; + al3 ^= cl; + idx3 = al3; + + bx13 = bx03; + bx03 = cx3; - cl = ((uint64_t*) &l3[idx3 & MASK])[0]; - ch = ((uint64_t*) &l3[idx3 & MASK])[1]; - lo = __umul128(idx3, cl, &hi); + cl = ((uint64_t*) &l4[idx4 & MASK])[0]; + ch = ((uint64_t*) &l4[idx4 & MASK])[1]; - al3 += hi; - ah3 += lo; + INTEGER_MATH_V2(4, cl, cx4); - ah3 ^= tweak1_2_3; - ((uint64_t*) &l3[idx3 & MASK])[0] = al3; - ((uint64_t*) &l3[idx3 & MASK])[1] = ah3; - ah3 ^= tweak1_2_3; + lo = __umul128(idx4, cl, &hi); - ah3 ^= ch; - al3 ^= cl; - idx3 = al3; + SHUFFLE_PHASE_2(l4, (idx4&MASK), bx04, bx14, ax4, lo, hi); + al4 += hi; + ah4 += lo; - cl = ((uint64_t*) &l4[idx4 & MASK])[0]; - ch = ((uint64_t*) &l4[idx4 & MASK])[1]; - lo = __umul128(idx4, cl, &hi); + ((uint64_t*) &l4[idx4 & MASK])[0] = al4; + ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; - al4 += hi; - ah4 += lo; 
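+ // lane 4 (CN variant 2): fold the loaded quadwords into the accumulator (ah4 ^= ch, al4 ^= cl); the low half then selects the next scratchpad offset via idx4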
+ ah4 ^= ch; + al4 ^= cl; + idx4 = al4; - ah4 ^= tweak1_2_4; - ((uint64_t*) &l4[idx4 & MASK])[0] = al4; - ((uint64_t*) &l4[idx4 & MASK])[1] = ah4; - ah4 ^= tweak1_2_4; + bx14 = bx04; + bx04 = cx4; + } - ah4 ^= ch; - al4 ^= cl; - idx4 = al4; - } + cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); + cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); + cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); + cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); + cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); - cn_implode_scratchpad((__m128i*) l0, (__m128i*) h0); - cn_implode_scratchpad((__m128i*) l1, (__m128i*) h1); - cn_implode_scratchpad((__m128i*) l2, (__m128i*) h2); - cn_implode_scratchpad((__m128i*) l3, (__m128i*) h3); - cn_implode_scratchpad((__m128i*) l4, (__m128i*) h4); + keccakf(h0, 24); + keccakf(h1, 24); + keccakf(h2, 24); + keccakf(h3, 24); + keccakf(h4, 24); - keccakf(h0, 24); - keccakf(h1, 24); - keccakf(h2, 24); - keccakf(h3, 24); - keccakf(h4, 24); + extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); + extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); + extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); + extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); + extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); + } - extra_hashes[scratchPad[0]->state[0] & 3](scratchPad[0]->state, 200, output); - extra_hashes[scratchPad[1]->state[0] & 3](scratchPad[1]->state, 200, output + 32); - extra_hashes[scratchPad[2]->state[0] & 3](scratchPad[2]->state, 200, output + 64); - extra_hashes[scratchPad[3]->state[0] & 3](scratchPad[3]->state, 200, output + 96); - extra_hashes[scratchPad[4]->state[0] & 3](scratchPad[4]->state, 200, output + 128); - } + inline static void hashPowV3_asm(const uint8_t* __restrict__ input, + size_t size, + uint8_t* __restrict__ output, + ScratchPad** __restrict__ scratchPad, + AsmOptimization asmOptimization) + { + // not supported + } inline static void hashLiteTube(const uint8_t* __restrict__ input, size_t size, diff --git a/src/crypto/asm/cn_main_loop.S b/src/crypto/asm/cn_main_loop.S new file mode 100644 index 00000000..555f3fb1 --- /dev/null +++ b/src/crypto/asm/cn_main_loop.S @@ -0,0 +1,88 @@ +#define ALIGN .align +.intel_syntax noprefix +#ifdef __APPLE__ +# define FN_PREFIX(fn) _ ## fn +.text +#else +# define FN_PREFIX(fn) fn +.section .text +#endif +.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv1_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv1_mainloop_sandybridge.inc" + add rsp, 48 + ret 0 +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ivybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_ryzen_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_main_loop_ryzen.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + mov rdx, rsi + #include 
"cnv2_double_main_loop_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv1_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): + sub rsp, 48 + mov rcx, rdi + #include "cnv2_mainloop_soft_aes_sandybridge.inc" + add rsp, 48 + ret 0 diff --git a/src/crypto/asm/cn_main_loop.asm b/src/crypto/asm/cn_main_loop.asm new file mode 100644 index 00000000..00cf6d09 --- /dev/null +++ b/src/crypto/asm/cn_main_loop.asm @@ -0,0 +1,71 @@ +_TEXT_CN_MAINLOOP SEGMENT PAGE READ EXECUTE +PUBLIC cnv1_mainloop_sandybridge_asm +PUBLIC cnv2_mainloop_ivybridge_asm +PUBLIC cnv2_mainloop_ryzen_asm +PUBLIC cnv2_double_mainloop_sandybridge_asm + +PUBLIC cnv1_mainloop_soft_aes_sandybridge_asm +PUBLIC cnv2_mainloop_soft_aes_sandybridge_asm + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv1_mainloop_sandybridge_asm PROC + INCLUDE cnv1_mainloop_sandybridge.inc + ret 0 +cnv1_mainloop_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_ivybridge_asm PROC + INCLUDE cnv2_main_loop_ivybridge.inc + ret 0 +cnv2_mainloop_ivybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_ryzen_asm PROC + INCLUDE cnv2_main_loop_ryzen.inc + ret 0 +cnv2_mainloop_ryzen_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_double_mainloop_sandybridge_asm PROC + INCLUDE cnv2_double_main_loop_sandybridge.inc + ret 0 +cnv2_double_mainloop_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv1_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cnv1_mainloop_soft_aes_sandybridge.inc + ret 0 +cnv1_mainloop_soft_aes_sandybridge_asm ENDP + +#ifdef __APPLE__ +ALIGN 16 +#else +ALIGN 64 +#endif +cnv2_mainloop_soft_aes_sandybridge_asm PROC + INCLUDE cnv2_mainloop_soft_aes_sandybridge.inc + ret 0 +cnv2_mainloop_soft_aes_sandybridge_asm ENDP + +_TEXT_CN_MAINLOOP ENDS +END diff --git a/src/crypto/asm/cn_main_loop_win_gcc.S b/src/crypto/asm/cn_main_loop_win_gcc.S new file mode 100644 index 00000000..4edb17f8 --- /dev/null +++ b/src/crypto/asm/cn_main_loop_win_gcc.S @@ -0,0 +1,42 @@ +#define ALIGN .align +.intel_syntax noprefix +# define FN_PREFIX(fn) fn +.section .text + +.global FN_PREFIX(cnv1_mainloop_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ivybridge_asm) +.global FN_PREFIX(cnv2_mainloop_ryzen_asm) +.global FN_PREFIX(cnv2_double_mainloop_sandybridge_asm) + +.global FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm) +.global FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm) + +ALIGN 64 +FN_PREFIX(cnv1_mainloop_sandybridge_asm): + #include "cnv1_mainloop_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_ivybridge_asm): + #include "cnv2_main_loop_ivybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_ryzen_asm): + #include "cnv2_main_loop_ryzen.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_double_mainloop_sandybridge_asm): + #include "cnv2_double_main_loop_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv1_mainloop_soft_aes_sandybridge_asm): + #include "cnv1_mainloop_soft_aes_sandybridge.inc" + ret 0 + +ALIGN 64 +FN_PREFIX(cnv2_mainloop_soft_aes_sandybridge_asm): + #include "cnv2_mainloop_soft_aes_sandybridge.inc" + ret 0 \ No newline at end of file diff --git a/src/crypto/asm/cnv1_mainloop_sandybridge.inc b/src/crypto/asm/cnv1_mainloop_sandybridge.inc new file mode 100644 
index 00000000..89cc15e8 --- /dev/null +++ b/src/crypto/asm/cnv1_mainloop_sandybridge.inc @@ -0,0 +1,74 @@ + mov QWORD PTR [rsp+8], rbx + mov QWORD PTR [rsp+16], rbp + mov QWORD PTR [rsp+24], rsi + mov QWORD PTR [rsp+32], rdi + push r14 + push r15 + mov rax, QWORD PTR [rcx+48] + mov ebp, 524288 + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm3, rax + mov rax, QWORD PTR [rcx+256] + mov rdi, QWORD PTR [rcx+40] + movq xmm0, rdx + xor rdi, QWORD PTR [rcx+8] + mov rdx, r8 + mov r15, QWORD PTR [rcx+264] + and edx, 2097136 + mov r14, QWORD PTR [rax+35] + xor r14, QWORD PTR [rcx+192] + mov rsi, QWORD PTR [rcx+224] + punpcklqdq xmm3, xmm0 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv1_mainloop_sandybridge: + movq xmm0, rdi + movq xmm1, r8 + punpcklqdq xmm1, xmm0 + aesenc xmm2, xmm1 + movq r10, xmm2 + mov r9d, r10d + and r9d, 2097136 + add r9, rsi + movdqa xmm0, xmm2 + pxor xmm0, xmm3 + movdqa xmm3, xmm2 + movdqu XMMWORD PTR [rdx+rsi], xmm0 + psrldq xmm0, 11 + movq rax, xmm0 + movzx eax, al + movzx eax, BYTE PTR [rax+r15] + mov BYTE PTR [rsi+rdx+11], al + mov rbx, QWORD PTR [r9] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + mul r10 + add r8, rdx + mov QWORD PTR [r9], r8 + add rdi, rax + mov rax, r14 + xor rax, rdi + mov QWORD PTR [r9+8], rax + xor r8, rbx + mov rdx, r8 + and edx, 2097136 + movdqu xmm2, XMMWORD PTR [rdx+rsi] + xor rdi, r11 + dec ebp + jne cnv1_mainloop_sandybridge + + mov rbx, QWORD PTR [rsp+24] + mov rbp, QWORD PTR [rsp+32] + mov rsi, QWORD PTR [rsp+40] + mov rdi, QWORD PTR [rsp+48] + pop r15 + pop r14 diff --git a/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..5a28185e --- /dev/null +++ b/src/crypto/asm/cnv1_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,166 @@ + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 72 + + movaps XMMWORD PTR [rsp], xmm6 + movaps XMMWORD PTR [rsp+16], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + movaps XMMWORD PTR [rsp+48], xmm9 + + mov rax, QWORD PTR [rcx+48] + xor rax, QWORD PTR [rcx+16] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + movq xmm4, rax + mov rax, QWORD PTR [rcx+256] + mov r13, QWORD PTR [rcx+40] + movq xmm0, rdx + xor r13, QWORD PTR [rcx+8] + mov rdx, r8 + mov rdi, QWORD PTR [rcx+224] + and edx, 2097136 + mov rax, QWORD PTR [rax+35] + xor rax, QWORD PTR [rcx+192] + movq xmm5, rax + movq xmm8, rdi + punpcklqdq xmm4, xmm0 + mov QWORD PTR [rsp+64], rdx + + movq xmm6, rcx + mov rax, QWORD PTR [rcx+264] + movq xmm7, rax + + mov eax, 524288 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv1_mainloop_soft_aes_sandybridge: + movq xmm9, rax + mov r12, QWORD PTR [rcx+272] + mov esi, DWORD PTR [rdx+rdi] + mov r10d, DWORD PTR [rdx+rdi+4] + mov ebp, DWORD PTR [rdx+rdi+12] + mov r14d, DWORD PTR [rdx+rdi+8] + mov rdx, QWORD PTR [rsp+64] + movzx ecx, sil + shr esi, 8 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 
+ movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + add ebp, 256 + movd xmm1, r11d + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movq rdi, xmm8 + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + punpckldq xmm2, xmm1 + movq xmm1, r8 + xor eax, DWORD PTR [r12+rcx*4] + xor eax, r15d + movd xmm3, eax + movq rax, xmm7 + punpckldq xmm3, xmm0 + movq xmm0, r13 + punpcklqdq xmm1, xmm0 + punpckldq xmm3, xmm2 + pxor xmm3, xmm1 + movq r9, xmm3 + mov r10d, r9d + and r10d, 2097136 + movdqa xmm0, xmm3 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx+rdi], xmm0 + psrldq xmm0, 11 + movq rcx, xmm0 + movzx ecx, cl + mov cl, BYTE PTR [rcx+rax] + mov BYTE PTR [rdi+rdx+11], cl + mov rbx, QWORD PTR [r10+rdi] + mov rcx, r9 + lea r9, QWORD PTR [r10+rdi] + mov r11, QWORD PTR [r9+8] + mov rax, rbx + movdqa xmm4, xmm3 + mul rcx + movq rcx, xmm6 + add r8, rdx + add r13, rax + movq rax, xmm5 + xor rax, r13 + mov QWORD PTR [r9], r8 + xor r8, rbx + mov QWORD PTR [r9+8], rax + movq rax, xmm9 + mov rdx, r8 + xor r13, r11 + and edx, 2097136 + mov QWORD PTR [rsp+64], rdx + sub eax, 1 + jne cnv1_mainloop_soft_aes_sandybridge + + movaps xmm6, XMMWORD PTR [rsp] + movaps xmm7, XMMWORD PTR [rsp+16] + movaps xmm8, XMMWORD PTR [rsp+32] + movaps xmm9, XMMWORD PTR [rsp+48] + + add rsp, 72 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx diff --git a/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc new file mode 100644 index 00000000..1ea871f3 --- /dev/null +++ b/src/crypto/asm/cnv2_double_main_loop_sandybridge.inc @@ -0,0 +1,414 @@ + mov rax, rsp + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 184 + + stmxcsr DWORD PTR [rsp+272] + mov DWORD PTR [rsp+276], 24448 + ldmxcsr DWORD PTR [rsp+276] + + mov r13, QWORD PTR [rcx+224] + mov r9, rdx + mov r10, QWORD PTR [rcx+32] + mov r8, rcx + xor r10, QWORD PTR [rcx] + mov r14d, 524288 + mov r11, QWORD PTR [rcx+40] + xor r11, QWORD PTR [rcx+8] + mov rsi, QWORD PTR [rdx+224] + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov rdi, QWORD PTR [r9+32] + xor rdi, QWORD PTR [r9] + mov rbp, QWORD PTR [r9+40] + xor rbp, QWORD PTR [r9+8] + movq xmm0, rdx + movaps XMMWORD PTR [rax-88], xmm6 + movaps XMMWORD PTR [rax-104], xmm7 + movaps XMMWORD PTR [rax-120], xmm8 + movaps XMMWORD PTR [rsp+112], xmm9 + movaps XMMWORD PTR [rsp+96], xmm10 + movaps XMMWORD PTR [rsp+80], xmm11 + movaps XMMWORD PTR [rsp+64], xmm12 + movaps XMMWORD PTR [rsp+48], xmm13 + movaps XMMWORD PTR [rsp+32], xmm14 + movaps XMMWORD PTR [rsp+16], xmm15 + mov rdx, r10 + movq xmm4, QWORD PTR [r8+96] + and edx, 2097136 + mov rax, QWORD PTR [rcx+48] + xorps xmm13, xmm13 + xor rax, QWORD PTR [rcx+16] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r8+72] + movq xmm5, QWORD PTR [r8+104] + movq xmm7, rax + + mov eax, 1 + shl rax, 52 + movq xmm14, rax + punpcklqdq xmm14, xmm14 + + mov eax, 1023 + shl rax, 52 + movq xmm12, rax + punpcklqdq xmm12, xmm12 + + mov rax, QWORD PTR [r8+80] + xor rax, QWORD PTR [r8+64] + punpcklqdq xmm7, xmm0 + 
movq xmm0, rcx + mov rcx, QWORD PTR [r9+56] + xor rcx, QWORD PTR [r9+24] + movq xmm3, rax + mov rax, QWORD PTR [r9+48] + xor rax, QWORD PTR [r9+16] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp], r13 + mov rcx, QWORD PTR [r9+88] + xor rcx, QWORD PTR [r9+72] + movq xmm6, rax + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + punpcklqdq xmm6, xmm0 + movq xmm0, rcx + mov QWORD PTR [rsp+256], r10 + mov rcx, rdi + mov QWORD PTR [rsp+264], r11 + movq xmm8, rax + and ecx, 2097136 + punpcklqdq xmm8, xmm0 + movq xmm0, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, QWORD PTR [r9+104] + lea r8, QWORD PTR [rcx+rsi] + movdqu xmm11, XMMWORD PTR [r8] + punpcklqdq xmm5, xmm0 + lea r9, QWORD PTR [rdx+r13] + movdqu xmm15, XMMWORD PTR [r9] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +main_loop_double_sandybridge: + movdqu xmm9, xmm15 + mov eax, edx + mov ebx, edx + xor eax, 16 + xor ebx, 32 + xor edx, 48 + + movq xmm0, r11 + movq xmm2, r10 + punpcklqdq xmm2, xmm0 + aesenc xmm9, xmm2 + + movdqu xmm0, XMMWORD PTR [rax+r13] + movdqu xmm1, XMMWORD PTR [rbx+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [rbx+r13], xmm0 + movdqu xmm0, XMMWORD PTR [rdx+r13] + movdqu XMMWORD PTR [rdx+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [rax+r13], xmm0 + + movq r11, xmm9 + mov edx, r11d + and edx, 2097136 + movdqa xmm0, xmm9 + pxor xmm0, xmm7 + movdqu XMMWORD PTR [r9], xmm0 + + lea rbx, QWORD PTR [rdx+r13] + mov r10, QWORD PTR [rdx+r13] + + movdqu xmm10, xmm11 + movq xmm0, rbp + movq xmm11, rdi + punpcklqdq xmm11, xmm0 + aesenc xmm10, xmm11 + + mov eax, ecx + mov r12d, ecx + xor eax, 16 + xor r12d, 32 + xor ecx, 48 + + movdqu xmm0, XMMWORD PTR [rax+rsi] + paddq xmm0, xmm6 + movdqu xmm1, XMMWORD PTR [r12+rsi] + movdqu XMMWORD PTR [r12+rsi], xmm0 + paddq xmm1, xmm11 + movdqu xmm0, XMMWORD PTR [rcx+rsi] + movdqu XMMWORD PTR [rcx+rsi], xmm1 + paddq xmm0, xmm8 + movdqu XMMWORD PTR [rax+rsi], xmm0 + + movq rcx, xmm10 + and ecx, 2097136 + + movdqa xmm0, xmm10 + pxor xmm0, xmm6 + movdqu XMMWORD PTR [r8], xmm0 + mov r12, QWORD PTR [rcx+rsi] + + mov r9, QWORD PTR [rbx+8] + + xor edx, 16 + mov r8d, edx + mov r15d, edx + + movq rdx, xmm5 + shl rdx, 32 + movq rax, xmm4 + xor rdx, rax + xor r10, rdx + mov rax, r10 + mul r11 + mov r11d, r8d + xor r11d, 48 + movq xmm0, rdx + xor rdx, [r11+r13] + movq xmm1, rax + xor rax, [r11+r13+8] + punpcklqdq xmm0, xmm1 + + pxor xmm0, XMMWORD PTR [r8+r13] + xor r8d, 32 + movdqu xmm1, XMMWORD PTR [r11+r13] + paddq xmm0, xmm7 + paddq xmm1, xmm2 + movdqu XMMWORD PTR [r11+r13], xmm0 + movdqu xmm0, XMMWORD PTR [r8+r13] + movdqu XMMWORD PTR [r8+r13], xmm1 + paddq xmm0, xmm3 + movdqu XMMWORD PTR [r15+r13], xmm0 + + mov r11, QWORD PTR [rsp+256] + add r11, rdx + mov rdx, QWORD PTR [rsp+264] + add rdx, rax + mov QWORD PTR [rbx], r11 + xor r11, r10 + mov QWORD PTR [rbx+8], rdx + xor rdx, r9 + mov QWORD PTR [rsp+256], r11 + and r11d, 2097136 + mov QWORD PTR [rsp+264], rdx + mov QWORD PTR [rsp+8], r11 + lea r15, QWORD PTR [r11+r13] + movdqu xmm15, XMMWORD PTR [r11+r13] + lea r13, QWORD PTR [rsi+rcx] + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movaps xmm2, xmm13 + movq r10, xmm0 + psllq xmm5, 1 + shl r10, 32 + movdqa xmm0, xmm9 + psrldq xmm0, 8 + movdqa xmm1, xmm10 + movq r11, xmm0 + psrldq xmm1, 8 + movq r8, xmm1 + psrldq xmm4, 8 + movaps xmm0, xmm13 + movq rax, xmm4 + xor r10, rax + movaps xmm1, xmm13 + xor r10, r12 + lea rax, QWORD PTR [r11+1] + shr rax, 1 + movdqa xmm3, xmm9 + punpcklqdq xmm3, xmm10 + paddq xmm5, xmm3 + movq rdx, xmm5 + psrldq 
xmm5, 8 + cvtsi2sd xmm2, rax + or edx, -2147483647 + lea rax, QWORD PTR [r8+1] + shr rax, 1 + movq r9, xmm5 + cvtsi2sd xmm0, rax + or r9d, -2147483647 + cvtsi2sd xmm1, rdx + unpcklpd xmm2, xmm0 + movaps xmm0, xmm13 + cvtsi2sd xmm0, r9 + unpcklpd xmm1, xmm0 + divpd xmm2, xmm1 + paddq xmm2, xmm14 + cvttsd2si rax, xmm2 + psrldq xmm2, 8 + mov rbx, rax + imul rax, rdx + sub r11, rax + js div_fix_1_sandybridge +div_fix_1_ret_sandybridge: + + cvttsd2si rdx, xmm2 + mov rax, rdx + imul rax, r9 + movd xmm2, r11d + movd xmm4, ebx + sub r8, rax + js div_fix_2_sandybridge +div_fix_2_ret_sandybridge: + + movd xmm1, r8d + movd xmm0, edx + punpckldq xmm2, xmm1 + punpckldq xmm4, xmm0 + punpckldq xmm4, xmm2 + paddq xmm3, xmm4 + movdqa xmm0, xmm3 + psrlq xmm0, 12 + paddq xmm0, xmm12 + sqrtpd xmm1, xmm0 + movq r9, xmm1 + movdqa xmm5, xmm1 + psrlq xmm5, 19 + test r9, 524287 + je sqrt_fix_1_sandybridge +sqrt_fix_1_ret_sandybridge: + + movq r9, xmm10 + psrldq xmm1, 8 + movq r8, xmm1 + test r8, 524287 + je sqrt_fix_2_sandybridge +sqrt_fix_2_ret_sandybridge: + + mov r12d, ecx + mov r8d, ecx + xor r12d, 16 + xor r8d, 32 + xor ecx, 48 + mov rax, r10 + mul r9 + movq xmm0, rax + movq xmm3, rdx + punpcklqdq xmm3, xmm0 + + movdqu xmm0, XMMWORD PTR [r12+rsi] + pxor xmm0, xmm3 + movdqu xmm1, XMMWORD PTR [r8+rsi] + xor rdx, [r8+rsi] + xor rax, [r8+rsi+8] + movdqu xmm3, XMMWORD PTR [rcx+rsi] + paddq xmm0, xmm6 + paddq xmm1, xmm11 + paddq xmm3, xmm8 + movdqu XMMWORD PTR [r8+rsi], xmm0 + movdqu XMMWORD PTR [rcx+rsi], xmm1 + movdqu XMMWORD PTR [r12+rsi], xmm3 + + add rdi, rdx + mov QWORD PTR [r13], rdi + xor rdi, r10 + mov ecx, edi + and ecx, 2097136 + lea r8, QWORD PTR [rcx+rsi] + + mov rdx, QWORD PTR [r13+8] + add rbp, rax + mov QWORD PTR [r13+8], rbp + movdqu xmm11, XMMWORD PTR [rcx+rsi] + xor rbp, rdx + mov r13, QWORD PTR [rsp] + movdqa xmm3, xmm7 + mov rdx, QWORD PTR [rsp+8] + movdqa xmm8, xmm6 + mov r10, QWORD PTR [rsp+256] + movdqa xmm7, xmm9 + mov r11, QWORD PTR [rsp+264] + movdqa xmm6, xmm10 + mov r9, r15 + dec r14d + jne main_loop_double_sandybridge + + ldmxcsr DWORD PTR [rsp+272] + movaps xmm13, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+184] + movaps xmm6, XMMWORD PTR [r11-24] + movaps xmm7, XMMWORD PTR [r11-40] + movaps xmm8, XMMWORD PTR [r11-56] + movaps xmm9, XMMWORD PTR [r11-72] + movaps xmm10, XMMWORD PTR [r11-88] + movaps xmm11, XMMWORD PTR [r11-104] + movaps xmm12, XMMWORD PTR [r11-120] + movaps xmm14, XMMWORD PTR [rsp+32] + movaps xmm15, XMMWORD PTR [rsp+16] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_double_mainloop_asm_sandybridge_endp + +div_fix_1_sandybridge: + dec rbx + add r11, rdx + jmp div_fix_1_ret_sandybridge + +div_fix_2_sandybridge: + dec rdx + add r8, r9 + jmp div_fix_2_ret_sandybridge + +sqrt_fix_1_sandybridge: + movq r8, xmm3 + movdqa xmm0, xmm5 + psrldq xmm0, 8 + dec r9 + mov r11d, -1022 + shl r11, 32 + mov rax, r9 + shr r9, 19 + shr rax, 20 + mov rdx, r9 + sub rdx, rax + lea rdx, [rdx+r11+1] + add rax, r11 + imul rdx, rax + sub rdx, r8 + adc r9, 0 + movq xmm5, r9 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_1_ret_sandybridge + +sqrt_fix_2_sandybridge: + psrldq xmm3, 8 + movq r11, xmm3 + dec r8 + mov ebx, -1022 + shl rbx, 32 + mov rax, r8 + shr r8, 19 + shr rax, 20 + mov rdx, r8 + sub rdx, rax + lea rdx, [rdx+rbx+1] + add rax, rbx + imul rdx, rax + sub rdx, r11 + adc r8, 0 + movq xmm0, r8 + punpcklqdq xmm5, xmm0 + jmp sqrt_fix_2_ret_sandybridge + +cnv2_double_mainloop_asm_sandybridge_endp: diff --git 
a/src/crypto/asm/cnv2_main_loop_ivybridge.inc b/src/crypto/asm/cnv2_main_loop_ivybridge.inc new file mode 100644 index 00000000..35ee0627 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_ivybridge.inc @@ -0,0 +1,186 @@ + mov QWORD PTR [rsp+24], rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 80 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov esi, 524288 + mov r8, QWORD PTR [rcx+32] + mov r13d, -2147483647 + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm4, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + movq xmm3, QWORD PTR [r9+104] + movaps XMMWORD PTR [rsp+64], xmm6 + movaps XMMWORD PTR [rsp+48], xmm7 + movaps XMMWORD PTR [rsp+32], xmm8 + and r10d, 2097136 + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm4, xmm0 + movq xmm0, rcx + punpcklqdq xmm5, xmm0 + movdqu xmm6, XMMWORD PTR [r10+rbx] + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ivybridge: + lea rdx, QWORD PTR [r10+rbx] + mov ecx, r10d + mov eax, r10d + mov rdi, r15 + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + movq xmm0, r11 + movq xmm7, r8 + punpcklqdq xmm7, xmm0 + aesenc xmm6, xmm7 + movq rbp, xmm6 + mov r9, rbp + and r9d, 2097136 + movdqu xmm2, XMMWORD PTR [rcx+rbx] + movdqu xmm1, XMMWORD PTR [rax+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm1, xmm7 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [rcx+rbx], xmm0 + movdqu XMMWORD PTR [rax+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + mov r10, r9 + xor r10d, 32 + movq rcx, xmm3 + mov rax, rcx + shl rax, 32 + xor rdi, rax + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + movdqu XMMWORD PTR [rdx], xmm0 + xor rdi, QWORD PTR [r9+rbx] + lea r14, QWORD PTR [r9+rbx] + mov r12, QWORD PTR [r14+8] + xor edx, edx + lea r9d, DWORD PTR [ecx+ecx] + add r9d, ebp + movdqa xmm0, xmm6 + psrldq xmm0, 8 + or r9d, r13d + movq rax, xmm0 + div r9 + xorps xmm3, xmm3 + mov eax, eax + shl rdx, 32 + add rdx, rax + lea r9, QWORD PTR [rdx+rbp] + mov r15, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm3, xmm0 + movq rdx, xmm3 + test edx, 524287 + je $sqrt_fixup_ivybridge + psrlq xmm3, 19 +$sqrt_fixup_ivybridge_ret: + + mov ecx, r10d + mov rax, rdi + mul rbp + movq xmm2, rdx + xor rdx, [rcx+rbx] + add r8, rdx + mov QWORD PTR [r14], r8 + xor r8, rdi + mov edi, r8d + and edi, 2097136 + movq xmm0, rax + xor rax, [rcx+rbx+8] + add r11, rax + mov QWORD PTR [r14+8], r11 + punpcklqdq xmm2, xmm0 + + mov r9d, r10d + xor r9d, 48 + xor r10d, 16 + pxor xmm2, XMMWORD PTR [r9+rbx] + movdqu xmm0, XMMWORD PTR [r10+rbx] + paddq xmm0, xmm5 + movdqu xmm1, XMMWORD PTR [rcx+rbx] + paddq xmm2, xmm4 + paddq xmm1, xmm7 + movdqa xmm5, xmm4 + movdqu XMMWORD PTR [r9+rbx], xmm0 + movdqa xmm4, xmm6 + movdqu XMMWORD PTR [rcx+rbx], xmm2 + movdqu XMMWORD PTR [r10+rbx], xmm1 + movdqu xmm6, [rdi+rbx] + mov r10d, edi + xor r11, r12 + dec rsi + jne $main_loop_ivybridge + + ldmxcsr DWORD PTR [rsp] + mov rbx, QWORD PTR [rsp+160] + movaps xmm6, XMMWORD PTR [rsp+64] + movaps xmm7, XMMWORD PTR [rsp+48] + movaps xmm8, XMMWORD PTR [rsp+32] + add rsp, 80 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp 
+ jmp $cnv2_main_loop_ivybridge_endp + +$sqrt_fixup_ivybridge: + dec rdx + mov r13d, -1022 + shl r13, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + add rax, r13 + not r13 + sub rcx, r13 + mov r13d, -2147483647 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm3, rdx + jmp $sqrt_fixup_ivybridge_ret + +$cnv2_main_loop_ivybridge_endp: diff --git a/src/crypto/asm/cnv2_main_loop_ryzen.inc b/src/crypto/asm/cnv2_main_loop_ryzen.inc new file mode 100644 index 00000000..42054413 --- /dev/null +++ b/src/crypto/asm/cnv2_main_loop_ryzen.inc @@ -0,0 +1,183 @@ + mov QWORD PTR [rsp+16], rbx + mov QWORD PTR [rsp+24], rbp + mov QWORD PTR [rsp+32], rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 64 + + stmxcsr DWORD PTR [rsp] + mov DWORD PTR [rsp+4], 24448 + ldmxcsr DWORD PTR [rsp+4] + + mov rax, QWORD PTR [rcx+48] + mov r9, rcx + xor rax, QWORD PTR [rcx+16] + mov ebp, 524288 + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r11, QWORD PTR [rcx+40] + mov r10, r8 + mov rdx, QWORD PTR [rcx+56] + movq xmm3, rax + xor rdx, QWORD PTR [rcx+24] + xor r11, QWORD PTR [rcx+8] + mov rbx, QWORD PTR [rcx+224] + mov rax, QWORD PTR [r9+80] + xor rax, QWORD PTR [r9+64] + movq xmm0, rdx + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r9+72] + mov rdi, QWORD PTR [r9+104] + and r10d, 2097136 + movaps XMMWORD PTR [rsp+48], xmm6 + movq xmm4, rax + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+16], xmm8 + xorps xmm8, xmm8 + mov ax, 1023 + shl rax, 52 + movq xmm7, rax + mov r15, QWORD PTR [r9+96] + punpcklqdq xmm3, xmm0 + movq xmm0, rcx + punpcklqdq xmm4, xmm0 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +$main_loop_ryzen: + movdqa xmm5, XMMWORD PTR [r10+rbx] + movq xmm0, r11 + movq xmm6, r8 + punpcklqdq xmm6, xmm0 + lea rdx, QWORD PTR [r10+rbx] + lea r9, QWORD PTR [rdi+rdi] + shl rdi, 32 + + mov ecx, r10d + mov eax, r10d + xor ecx, 16 + xor eax, 32 + xor r10d, 48 + aesenc xmm5, xmm6 + movdqa xmm2, XMMWORD PTR [rcx+rbx] + movdqa xmm1, XMMWORD PTR [rax+rbx] + movdqa xmm0, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + paddq xmm0, xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm0 + movdqa XMMWORD PTR [rax+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movaps xmm1, xmm8 + mov rsi, r15 + xor rsi, rdi + movq r14, xmm5 + movdqa xmm0, xmm5 + pxor xmm0, xmm3 + mov r10, r14 + and r10d, 2097136 + movdqa XMMWORD PTR [rdx], xmm0 + xor rsi, QWORD PTR [r10+rbx] + lea r12, QWORD PTR [r10+rbx] + mov r13, QWORD PTR [r10+rbx+8] + + add r9d, r14d + or r9d, -2147483647 + xor edx, edx + movdqa xmm0, xmm5 + psrldq xmm0, 8 + movq rax, xmm0 + + div r9 + movq xmm0, rax + movq xmm1, rdx + punpckldq xmm0, xmm1 + movq r15, xmm0 + paddq xmm0, xmm5 + movdqa xmm2, xmm0 + psrlq xmm0, 12 + paddq xmm0, xmm7 + sqrtsd xmm1, xmm0 + movq rdi, xmm1 + test rdi, 524287 + je $sqrt_fixup_ryzen + shr rdi, 19 + +$sqrt_fixup_ryzen_ret: + mov rax, rsi + mul r14 + movq xmm1, rax + movq xmm0, rdx + punpcklqdq xmm0, xmm1 + + mov r9d, r10d + mov ecx, r10d + xor r9d, 16 + xor ecx, 32 + xor r10d, 48 + movdqa xmm1, XMMWORD PTR [rcx+rbx] + xor rdx, [rcx+rbx] + xor rax, [rcx+rbx+8] + movdqa xmm2, XMMWORD PTR [r9+rbx] + pxor xmm2, xmm0 + paddq xmm4, XMMWORD PTR [r10+rbx] + paddq xmm2, xmm3 + paddq xmm1, xmm6 + movdqa XMMWORD PTR [r9+rbx], xmm4 + movdqa XMMWORD PTR [rcx+rbx], xmm2 + movdqa XMMWORD PTR [r10+rbx], xmm1 + + movdqa xmm4, xmm3 + add r8, rdx + add r11, rax + mov QWORD PTR [r12], r8 + xor r8, rsi + mov QWORD PTR [r12+8], r11 + mov r10, r8 + xor r11, r13 + and r10d, 
2097136 + movdqa xmm3, xmm5 + dec ebp + jne $main_loop_ryzen + + ldmxcsr DWORD PTR [rsp] + movaps xmm6, XMMWORD PTR [rsp+48] + lea r11, QWORD PTR [rsp+64] + mov rbx, QWORD PTR [r11+56] + mov rbp, QWORD PTR [r11+64] + mov rsi, QWORD PTR [r11+72] + movaps xmm8, XMMWORD PTR [r11-48] + movaps xmm7, XMMWORD PTR [rsp+32] + mov rsp, r11 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + jmp $cnv2_main_loop_ryzen_endp + +$sqrt_fixup_ryzen: + movq r9, xmm2 + dec rdi + mov edx, -1022 + shl rdx, 32 + mov rax, rdi + shr rdi, 19 + shr rax, 20 + mov rcx, rdi + sub rcx, rax + lea rcx, [rcx+rdx+1] + add rax, rdx + imul rcx, rax + sub rcx, r9 + adc rdi, 0 + jmp $sqrt_fixup_ryzen_ret + +$cnv2_main_loop_ryzen_endp: diff --git a/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc b/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc new file mode 100644 index 00000000..bc3da761 --- /dev/null +++ b/src/crypto/asm/cnv2_mainloop_soft_aes_sandybridge.inc @@ -0,0 +1,271 @@ + mov QWORD PTR [rsp+8], rcx + push rbx + push rbp + push rsi + push rdi + push r12 + push r13 + push r14 + push r15 + sub rsp, 152 + + stmxcsr DWORD PTR [rsp+4] + mov DWORD PTR [rsp], 24448 + ldmxcsr DWORD PTR [rsp] + + mov rax, QWORD PTR [rcx+48] + mov r10, rcx + xor rax, QWORD PTR [rcx+16] + mov r8, QWORD PTR [rcx+32] + xor r8, QWORD PTR [rcx] + mov r9, QWORD PTR [rcx+40] + xor r9, QWORD PTR [rcx+8] + movq xmm4, rax + mov rdx, QWORD PTR [rcx+56] + xor rdx, QWORD PTR [rcx+24] + mov r11, QWORD PTR [rcx+224] + mov rcx, QWORD PTR [rcx+88] + xor rcx, QWORD PTR [r10+72] + mov rax, QWORD PTR [r10+80] + movq xmm0, rdx + xor rax, QWORD PTR [r10+64] + + movaps XMMWORD PTR [rsp+16], xmm6 + movaps XMMWORD PTR [rsp+32], xmm7 + movaps XMMWORD PTR [rsp+48], xmm8 + movaps XMMWORD PTR [rsp+64], xmm9 + movaps XMMWORD PTR [rsp+80], xmm10 + movaps XMMWORD PTR [rsp+96], xmm11 + movaps XMMWORD PTR [rsp+112], xmm12 + movaps XMMWORD PTR [rsp+128], xmm13 + + movq xmm5, rax + + mov ax, 1023 + shl rax, 52 + movq xmm8, rax + + mov rax, r8 + punpcklqdq xmm4, xmm0 + and eax, 2097136 + movq xmm10, QWORD PTR [r10+96] + movq xmm0, rcx + mov rcx, QWORD PTR [r10+104] + xorps xmm9, xmm9 + mov QWORD PTR [rsp+248], rax + movq xmm12, r11 + mov QWORD PTR [rsp+240], r9 + punpcklqdq xmm5, xmm0 + movq xmm13, rcx + mov r12d, 524288 + + #ifdef __APPLE__ + ALIGN 16 + #else + ALIGN 64 + #endif +cnv2_mainloop_soft_aes_sandybridge: + movd xmm11, r12d + mov r12, QWORD PTR [r10+272] + lea r13, QWORD PTR [rax+r11] + mov esi, DWORD PTR [r13] + movq xmm0, r9 + mov r10d, DWORD PTR [r13+4] + movq xmm7, r8 + mov ebp, DWORD PTR [r13+12] + mov r14d, DWORD PTR [r13+8] + mov rdx, QWORD PTR [rsp+248] + movzx ecx, sil + shr esi, 8 + punpcklqdq xmm7, xmm0 + mov r15d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + mov edi, DWORD PTR [r12+rcx*4] + movzx ecx, r14b + shr r14d, 8 + mov ebx, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + shr ebp, 8 + mov r9d, DWORD PTR [r12+rcx*4] + movzx ecx, r10b + shr r10d, 8 + xor r15d, DWORD PTR [r12+rcx*4+1024] + movzx ecx, r14b + shr r14d, 8 + mov eax, r14d + shr eax, 8 + xor edi, DWORD PTR [r12+rcx*4+1024] + add eax, 256 + movzx ecx, bpl + shr ebp, 8 + xor ebx, DWORD PTR [r12+rcx*4+1024] + movzx ecx, sil + shr esi, 8 + xor r9d, DWORD PTR [r12+rcx*4+1024] + add r12, 2048 + movzx ecx, r10b + shr r10d, 8 + add r10d, 256 + mov r11d, DWORD PTR [r12+rax*4] + xor r11d, DWORD PTR [r12+rcx*4] + xor r11d, r9d + movzx ecx, sil + mov r10d, DWORD PTR [r12+r10*4] + shr esi, 8 + add esi, 256 + xor r10d, DWORD PTR [r12+rcx*4] + movzx ecx, bpl + xor r10d, ebx + shr ebp, 8 + movd 
xmm1, r11d + add ebp, 256 + movq r11, xmm12 + mov r9d, DWORD PTR [r12+rcx*4] + xor r9d, DWORD PTR [r12+rsi*4] + mov eax, DWORD PTR [r12+rbp*4] + xor r9d, edi + movzx ecx, r14b + movd xmm0, r10d + movd xmm2, r9d + xor eax, DWORD PTR [r12+rcx*4] + mov rcx, rdx + xor eax, r15d + punpckldq xmm2, xmm1 + xor rcx, 16 + movd xmm6, eax + mov rax, rdx + punpckldq xmm6, xmm0 + xor rax, 32 + punpckldq xmm6, xmm2 + xor rdx, 48 + movdqu xmm2, XMMWORD PTR [rcx+r11] + pxor xmm6, xmm7 + paddq xmm2, xmm4 + movdqu xmm1, XMMWORD PTR [rax+r11] + movdqu xmm0, XMMWORD PTR [rdx+r11] + paddq xmm0, xmm5 + movdqu XMMWORD PTR [rcx+r11], xmm0 + movdqu XMMWORD PTR [rax+r11], xmm2 + movq rcx, xmm13 + paddq xmm1, xmm7 + movdqu XMMWORD PTR [rdx+r11], xmm1 + movq rdi, xmm6 + mov r10, rdi + and r10d, 2097136 + xor edx, edx + mov rax, rcx + shl rax, 32 + movq rbx, xmm10 + xor rbx, rax + lea r9, QWORD PTR [rcx+rcx] + add r9d, edi + movdqa xmm0, xmm6 + pxor xmm0, xmm4 + mov ecx, -2147483647 + movdqu XMMWORD PTR [r13], xmm0 + or r9, rcx + movdqa xmm0, xmm6 + movaps xmm1, xmm9 + psrldq xmm0, 8 + movq rax, xmm0 + xor rbx, QWORD PTR [r10+r11] + lea r14, QWORD PTR [r10+r11] + mov rbp, QWORD PTR [r14+8] + div r9 + shl rdx, 32 + mov eax, eax + add rdx, rax + lea r9, QWORD PTR [rdx+rdi] + movq xmm10, rdx + mov rax, r9 + shr rax, 12 + movq xmm0, rax + paddq xmm0, xmm8 + sqrtsd xmm1, xmm0 + movq rdx, xmm1 + test rdx, 524287 + je sqrt_fixup_soft_aes_sandybridge + psrlq xmm1, 19 +sqrt_fixup_soft_aes_sandybridge_ret: + + mov r9, r10 + movdqa xmm13, xmm1 + xor r9, 16 + mov rcx, r10 + xor rcx, 32 + xor r10, 48 + mov rax, rbx + mul rdi + movdqu xmm2, XMMWORD PTR [r9+r11] + movdqu xmm1, XMMWORD PTR [rcx+r11] + paddq xmm1, xmm7 + movq xmm0, rax + movq xmm3, rdx + xor rax, QWORD PTR [r11+rcx+8] + xor rdx, QWORD PTR [rcx+r11] + punpcklqdq xmm3, xmm0 + add r8, rdx + movdqu xmm0, XMMWORD PTR [r10+r11] + pxor xmm2, xmm3 + paddq xmm0, xmm5 + paddq xmm2, xmm4 + movdqu XMMWORD PTR [r9+r11], xmm0 + movdqa xmm5, xmm4 + mov r9, QWORD PTR [rsp+240] + movdqa xmm4, xmm6 + add r9, rax + movdqu XMMWORD PTR [rcx+r11], xmm2 + movdqu XMMWORD PTR [r10+r11], xmm1 + mov r10, QWORD PTR [rsp+224] + movd r12d, xmm11 + mov QWORD PTR [r14], r8 + xor r8, rbx + mov rax, r8 + mov QWORD PTR [r14+8], r9 + and eax, 2097136 + xor r9, rbp + mov QWORD PTR [rsp+240], r9 + mov QWORD PTR [rsp+248], rax + sub r12d, 1 + jne cnv2_mainloop_soft_aes_sandybridge + + ldmxcsr DWORD PTR [rsp+4] + movaps xmm6, XMMWORD PTR [rsp+16] + movaps xmm7, XMMWORD PTR [rsp+32] + movaps xmm8, XMMWORD PTR [rsp+48] + movaps xmm9, XMMWORD PTR [rsp+64] + movaps xmm10, XMMWORD PTR [rsp+80] + movaps xmm11, XMMWORD PTR [rsp+96] + movaps xmm12, XMMWORD PTR [rsp+112] + movaps xmm13, XMMWORD PTR [rsp+128] + + add rsp, 152 + pop r15 + pop r14 + pop r13 + pop r12 + pop rdi + pop rsi + pop rbp + pop rbx + jmp cnv2_mainloop_soft_aes_sandybridge_asm_endp + +sqrt_fixup_soft_aes_sandybridge: + dec rdx + mov r15d, -1022 + shl r15, 32 + mov rax, rdx + shr rdx, 19 + shr rax, 20 + mov rcx, rdx + sub rcx, rax + lea rcx, [rcx+r15+1] + add rax, r15 + imul rcx, rax + sub rcx, r9 + adc rdx, 0 + movq xmm1, rdx + jmp sqrt_fixup_soft_aes_sandybridge_ret + +cnv2_mainloop_soft_aes_sandybridge_asm_endp: diff --git a/src/default_config.json b/src/default_config.json index d3081d1b..ec5fcecb 100644 --- a/src/default_config.json +++ b/src/default_config.json @@ -4,8 +4,9 @@ "threads": 0, // number of miner threads (not set or 0 enables automatic selection of optimal thread count) "multihash-factor": 0, // number of hash blocks to process 
at a time (not set or 0 enables automatic selection of optimal number of hash blocks) "multihash-thread-mask" : null, // for multihash-factors>0 only, limits multihash to given threads (mask), mask "0x3" means run multihash on thread 0 and 1 only (default: all threads) - "pow-variant" : "auto", // specificy the PoW variat to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto + "pow-variant" : "auto", // specify the PoW variant to use: -> auto (default), 0 (v0), 1 (v1, aka monerov7, aeonv7), 2 (v2, aka monerov8), tube (ipbc), alloy, xtl (including autodetect for v5), msr, xhv, rto // for further help see: https://github.com/Bendr0id/xmrigCC/wiki/Coin-configurations + "asm-optimization" : "auto", // specify the ASM optimization to use: -> auto (default), intel, ryzen, none "background": false, // true to run the miner in the background (Windows only, for *nix plase use screen/tmux or systemd service instead) "colors": true, // false to disable colored output "cpu-affinity": null, // set process affinity to CPU core(s), mask "0x3" for cores 0 and 1 diff --git a/src/interfaces/ILogBackend.h b/src/interfaces/ILogBackend.h index 458b504c..31d264de 100644 --- a/src/interfaces/ILogBackend.h +++ b/src/interfaces/ILogBackend.h @@ -31,6 +31,12 @@ class ILogBackend { public: +# ifdef APP_DEBUG + constexpr static const size_t kBufferSize = 1024; +# else + constexpr static const size_t kBufferSize = 512; +# endif + virtual ~ILogBackend() {} virtual void message(int level, const char* fmt, va_list args) = 0; diff --git a/src/log/FileLog.cpp b/src/log/FileLog.cpp index 7d04e574..8450a51b 100644 --- a/src/log/FileLog.cpp +++ b/src/log/FileLog.cpp @@ -56,19 +56,20 @@ void FileLog::message(int level, const char* fmt, va_list args) localtime_r(&now, &stime); # endif - auto *buf = new char[512]; - int size = snprintf(buf, 23, "[%d-%02d-%02d %02d:%02d:%02d] ", - stime.tm_year + 1900, - stime.tm_mon + 1, - stime.tm_mday, - stime.tm_hour, - stime.tm_min, - stime.tm_sec); + snprintf(m_fmt, sizeof(m_fmt) - 1, "[%d-%02d-%02d %02d:%02d:%02d] %s", + stime.tm_year + 1900, + stime.tm_mon + 1, + stime.tm_mday, + stime.tm_hour, + stime.tm_min, + stime.tm_sec, + fmt); - size = vsnprintf(buf + size, 512 - size - 1, fmt, args) + size; + auto *buf = new char[kBufferSize]; + const int size = vsnprintf(buf, kBufferSize - 1, m_fmt, args); buf[size] = '\n'; - std::string row = std::regex_replace(std::string(buf, size+1), std::regex("\x1B\\[[0-9;]*[a-zA-Z]"), ""); + std::string row = std::regex_replace(std::string(buf, static_cast<size_t>(size + 1)), std::regex("\x1B\\[[0-9;]*[a-zA-Z]"), ""); memcpy(buf, row.c_str(), row.length()); diff --git a/src/log/FileLog.h b/src/log/FileLog.h index 2b3ca5d4..469347d4 100644 --- a/src/log/FileLog.h +++ b/src/log/FileLog.h @@ -44,6 +44,7 @@ private: void write(char *data, size_t size); + char m_fmt[256]; int m_file; }; diff --git a/src/net/Job.cpp b/src/net/Job.cpp index 8de5034d..c9253489 100644 --- a/src/net/Job.cpp +++ b/src/net/Job.cpp @@ -138,24 +138,24 @@ bool Job::setTarget(const char *target) PowVariant Job::powVariant() const { - if (m_powVariant == PowVariant::POW_AUTODETECT) - { - return (m_blob[0] > 6 ?
PowVariant::POW_V1 : PowVariant::POW_V0); + if (m_powVariant == PowVariant::POW_AUTODETECT) { + if (m_blob[0] > 7) { + return PowVariant::POW_V2; + } else if (m_blob[0] > 6) { + return PowVariant::POW_V1; + } else { + return PowVariant::POW_V0; + } } - else if (m_powVariant == PowVariant::POW_XTL && m_blob[0] < 4) - { + else if (m_powVariant == PowVariant::POW_XTL && m_blob[0] < 4) { return POW_V1; } - else if (m_powVariant == PowVariant::POW_MSR && m_blob[0] < 7) - { + else if (m_powVariant == PowVariant::POW_MSR && m_blob[0] < 7) { return POW_V1; } - else if (m_powVariant == PowVariant::POW_XHV && m_blob[0] < 3) - { + else if (m_powVariant == PowVariant::POW_XHV && m_blob[0] < 3) { return POW_V0; - } - else - { + } else { return m_powVariant; } } diff --git a/src/version.h b/src/version.h index f8ce56e2..a8b6849b 100644 --- a/src/version.h +++ b/src/version.h @@ -36,13 +36,13 @@ #define APP_DESC "XMRigCC CPU miner" #define APP_COPYRIGHT "Copyright (C) 2017- BenDr0id" #endif -#define APP_VERSION "1.7.0 (based on XMRig)" +#define APP_VERSION "1.8.0_beta1 (based on XMRig)" #define APP_DOMAIN "" #define APP_SITE "https://github.com/Bendr0id/xmrigCC" #define APP_KIND "cpu" #define APP_VER_MAJOR 1 -#define APP_VER_MINOR 7 +#define APP_VER_MINOR 8 #define APP_VER_BUILD 0 #define APP_VER_REV 0 diff --git a/src/workers/MultiWorker.cpp b/src/workers/MultiWorker.cpp index e599b87f..15389fbb 100644 --- a/src/workers/MultiWorker.cpp +++ b/src/workers/MultiWorker.cpp @@ -140,7 +140,7 @@ void MultiWorker::start() *Job::nonce(m_state->blob + i * m_state->job.size()) = ++m_state->nonces[i]; } - CryptoNight::hash(m_hashFactor, m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); + CryptoNight::hash(m_hashFactor, Options::i()->asmOptimization(), m_state->job.powVariant(), m_state->blob, m_state->job.size(), m_hash, scratchPads); for (size_t i=0; i < m_hashFactor; ++i) { if (*reinterpret_cast(m_hash + 24 + i * 32) < m_state->job.target()) {